################################## Covid-19 vs. Real estate market #########################################
#
# Authors:
#
# Sandro Heiniger
# Swiss Institute for Empirical Economic Research
# University St. Gallen
# sandro.heiniger@unisg.ch
#
#### 



# Settings ----------------------------------------------------------------
setwd("set_path_here")


library(install.load)
install_load("tidyverse","readxl","reshape2","ggplot2","scales","GGally","psych",
             "stringi","Hmisc","plotrix","maps","sf","rnaturalearth","rnaturalearthdata","viridis","rgdal")

# General functions ----------------------------------------------------------------

replace_WZ=function(data, column){
  column_string=enquo(column)
  data %>% rename(old_column=column) %>% mutate(REPLACE_COL=case_when(
    old_column=="WZ_A" ~ "WZ_A : Land- und Forstwirtschaft, Fischerei", 
    old_column=="WZ_B" ~ "WZ_B : Bergbau und Gewinnung von Steinen und Erden", 
    old_column=="WZ_C" ~ "WZ_C : Verarbeitendes Gewerbe",
    old_column=="WZ_D" ~ "WZ_D : Energieversorgung",
    old_column=="WZ_E" ~ "WZ_E : Wasserversorg.,Entsorg.,Beseitig.v.Umweltverschm.",
    old_column=="WZ_F" ~ "WZ_F : Baugewerbe",
    old_column=="WZ_G" ~ "WZ_G : Handel, Instandhaltung und Reparatur von Kfz",
    old_column=="WZ_H" ~ "WZ_H : Verkehr und Lagerei",
    old_column=="WZ_I" ~ "WZ_I : Gastgewerbe",
    old_column=="WZ_J" ~ "WZ_J : Information und Kommunikation",
    old_column=="WZ_K" ~ "WZ_K : Erbringung von Finanz- und Versicherungsleistungen",
    old_column=="WZ_L" ~ "WZ_L : Grundstücks- und Wohnungswesen",
    old_column=="WZ_M" ~ "WZ_M : Freiberufliche, wiss. u. techn. Dienstleistungen",
    old_column=="WZ_N" ~ "WZ_N : Sonstige wirtschaftliche Dienstleistungen",
    old_column=="WZ_O" ~ "WZ_O : Öff. Verwaltung, Verteidigung, Sozialversicherung",
    old_column=="WZ_P" ~ "WZ_P : Erziehung und Unterricht",
    old_column=="WZ_Q" ~ "WZ_Q : Gesundheits- und Sozialwesen",
    old_column=="WZ_R" ~ "WZ_R : Kunst, Unterhaltung und Erholung",
    old_column=="WZ_S" ~ "WZ_S : Erbringung von sonstigen Dienstleistungen",
    old_column=="WZ_T" ~ "WZ_T : Private Haushalte",
    old_column=="WZ_U" ~ "WZ_U : Exterritoriale Organisationen und Körperschaften")) %>% select(-old_column) %>% rename(!!column:=REPLACE_COL)
}

# For maps
theme_map=ggplot() + 
  theme_void(base_size = 12)+ 
  theme(plot.background = element_rect(fill = "white", color = "white",linetype = "blank"),
        legend.position = c(0.9,0.5),
  	  plot.margin = margin(0,0,0,0,"cm"))+
  xlim(c(3300000,4090000))+
  ylim(c(5250000,6080000))+
  scale_size(guide = 'none')

worldmap <- ne_countries(scale = 'medium', type = 'map_units',returnclass = 'sf')
germany <- worldmap[worldmap$name == 'Germany',]

shapefile=st_read( "./Daten/Shapefile/vg250_ebenen_0101/VG250_GEM.shp")
shapefile=left_join(shapefile,
                    ags_book %>% select(ags_8, ags_9, ags_2,Population),
                    by=c("AGS"="ags_8")) %>% na.omit()

shapefile[,"ABR"]=NA
shapefile[shapefile$GF==4 & shapefile$GEN=="Hamburg","ABR"]="HAM"
shapefile[shapefile$GF==4 & shapefile$GEN=="Köln","ABR"]="COL"
shapefile[shapefile$GF==4 & shapefile$GEN=="Frankfurt am Main","ABR"]="FRA"
shapefile[shapefile$GF==4 & shapefile$GEN=="München","ABR"]="MUN"
shapefile[shapefile$GF==4 & shapefile$GEN=="Berlin","ABR"]="BER"

plot_type="jpeg"

# Create municipality data set --------------------------------------------

# AGS book
load_ags_book=function(){
  # The latest Gemeindeverzeichnis
  ags_book_import=suppressMessages(read_excel(path = "./Daten/Gemeindeverzeichnis/AuszugGV4QAktuell.xlsx",
                                              sheet = "Onlineprodukt_Gemeinden",trim_ws = T))
  colnames(ags_book_import)= paste0("import_column_",1:ncol(ags_book_import))
  
  ags_book=suppressWarnings(ags_book_import %>% 
                              filter(!is.na(import_column_7) & !is.na(as.numeric(import_column_7))) %>% 
                              mutate(municipality=import_column_8,
                                     ags_9=paste0(import_column_3, import_column_4, import_column_5, import_column_6),
                                     ags_8=paste0(import_column_3, import_column_4, import_column_5, import_column_7),
                                     ags_12=paste0(import_column_3, import_column_4, import_column_5, import_column_6, import_column_7),
                                     ags_5=paste0(import_column_3, import_column_4, import_column_5),
                                     ags_2=import_column_3,
                                     Population=replace_na(as.numeric(import_column_10,0))) %>%
                              select(-starts_with("import_column")))
  ags_book=suppressWarnings(left_join(left_join(ags_book, 
                                                ags_book_import %>% 
                                                  filter(!is.na(import_column_6) & is.na(import_column_7) & !is.na(as.numeric(import_column_6))) %>%
                                                  mutate(ags_9=paste0(import_column_3, import_column_4, import_column_5, import_column_6),
                                                         municipality_vb=import_column_8) %>% 
                                                  select(-starts_with("import_column")),
                                                by="ags_9"),
                                      ags_book_import %>% filter(!is.na(import_column_3) & import_column_1=="10") %>% 
                                        mutate(ags_2=import_column_3,
                                               Bundesland=import_column_8) %>%
                                        select(ags_2, Bundesland),
                                      by="ags_2"))
  
  return(ags_book)
}
ags_book=load_ags_book()

# Gebietsänderungen

load_gebietsaenderung=function(){
  gebietsaenderung=data.frame()
  for(current_year in 2011:2020){
    gebietsaenderung_current_year=suppressMessages(read_excel(path = paste0("./Daten/Gebietsänderungen/",current_year,".xlsx"),
                                                              sheet = paste0("Gebietsaenderungen ",current_year),trim_ws = T))
    gebietsaenderung_current_year=gebietsaenderung_current_year[replace_na(gebietsaenderung_current_year[,3]!=gebietsaenderung_current_year[,9],FALSE),c(3,4,6,7,8,9,10)]
    colnames(gebietsaenderung_current_year)=c("ags_12_before","ags_8_before","change_type","area","inhabitants","ags_12_after","ags_8_after")
    gebietsaenderung=suppressWarnings(bind_rows(gebietsaenderung,
                                                gebietsaenderung_current_year %>% 
                                                  filter(!is.na(as.numeric(ags_12_after))) %>% 
                                                  filter(substr(change_type,1,1) %in% c("1","3")) %>% 
                                                  select(-change_type) %>%
                                                  mutate(change_year=current_year)))
  }
  gebietsaenderung=left_join(left_join(gebietsaenderung,
                                       gebietsaenderung %>% select(ags_12_before, ags_8_before, ags_12_after, ags_8_after, change_year), 
                                       by=c("ags_12_after"="ags_12_before","ags_8_after"="ags_8_before")) %>% 
                               rowwise() %>%
                               mutate(ags_12_after.y=ifelse(isTRUE(change_year.y>change_year.x),ags_12_after.y,ags_12_after),
                                      ags_8_after.y=ifelse(isTRUE(change_year.y>change_year.x),ags_8_after.y,ags_8_after),
                                      change_year=max(change_year.x,change_year.y, na.rm = T)) %>%
                               mutate(ags_12_after=ifelse(is.na(ags_12_after.y),ags_12_after,ags_12_after.y),
                                      ags_8_after=ifelse(is.na(ags_8_after.y),ags_8_after,ags_8_after.y)) %>% 
                               select(-ags_12_after.y, -ags_8_after.y, -change_year.x, -change_year.y) %>% 
                               filter(isTRUE(ags_12_before!=ags_12_after) | isTRUE(ags_8_before!=ags_8_after)) %>% 
                               filter(str_length(ags_12_before) %in% c(9,12)) %>% 
                               mutate(ags_9_before=substr(ags_12_before,1,9),
                                      ags_9_after=substr(ags_12_after,1,9),
                                      ags_12_before=ifelse(str_length(ags_12_before)==12,ags_12_before,NA),
                                      ags_12_after=ifelse(str_length(ags_12_after)==12,ags_12_after,NA)) %>% 
                               mutate_at(vars("area","inhabitants"),list(~suppressWarnings(as.numeric(.)))) %>%
                               group_by(ags_12_before, ags_9_before, ags_8_before) %>%
                               mutate(area_total=sum(area, na.rm = T), inhabitants_total=sum(inhabitants, na.rm = T)) %>% 
                               ungroup %>% 
                               mutate(multiplier=ifelse(inhabitants_total==0,ifelse(area_total==0,1,area/area_total),inhabitants/inhabitants_total)) %>%
                               select(-area, -inhabitants, -area_total, -inhabitants_total) %>% 
                               unique(),
                             gebietsaenderung %>% select(ags_12_before, ags_8_before, ags_12_after, ags_8_after,change_year), 
                             by=c("ags_12_after"="ags_12_before","ags_8_after"="ags_8_before")) %>% 
    rowwise() %>%
    mutate(ags_12_after.y=ifelse(isTRUE(change_year.y>change_year.x),ags_12_after.y,ags_12_after),
           ags_8_after.y=ifelse(isTRUE(change_year.y>change_year.x),ags_8_after.y,ags_8_after),
           change_year=max(change_year.x,change_year.y, na.rm = T)) %>% 
    mutate(ags_12_after=ifelse(is.na(ags_12_after.y),ags_12_after,ags_12_after.y),
           ags_9_after=ifelse(is.na(ags_12_after.y),ags_9_after,substr(ags_12_after,1,9)),
           ags_8_after=ifelse(is.na(ags_8_after.y),ags_8_after,ags_8_after.y)) %>% 
    select(-ags_12_after.y, -ags_8_after.y, -change_year.x, -change_year.y) 
  
  gebietsaenderung=left_join(gebietsaenderung,
                             gebietsaenderung %>% filter(is.na(ags_12_before)) %>% 
                               select(ags_9_after, ags_9_before, change_year),
                             by=c("ags_9_after"="ags_9_before")) %>% 
    rowwise() %>%
    mutate(ags_9_after.y=ifelse(isTRUE(change_year.y>change_year.x),ags_9_after.y,ags_9_after),
           change_year=max(change_year.x,change_year.y, na.rm = T)) %>%
    mutate(ags_9_after=ifelse(is.na(ags_9_after.y),ags_9_after,ags_9_after.y),
           ags_12_after=ifelse(is.na(ags_12_after),NA,paste0(ags_9_after, substr(ags_12_after,10,12))),
           ags_8_after=ifelse(is.na(ags_12_after),NA,paste0(substr(ags_9_after,1,5), substr(ags_12_after,10,12)))) %>%
    select(-ags_9_after.y, -change_year.x, -change_year.y) %>% 
    ungroup()
  
  # drop changes that have been reverted
  
  to_drop=left_join(gebietsaenderung,gebietsaenderung %>% filter(is.na(ags_12_before)) %>% select(ags_9_before, ags_9_after, change_year),by=c("ags_9_after"="ags_9_before")) %>%
    filter(change_year.x>change_year.y) %>% 
    select(ags_9_after, ags_9_after.y) %>% 
    distinct()
  gebietsaenderung=gebietsaenderung %>% 
    filter(!(ags_9_before %in% to_drop$ags_9_after & ags_9_after %in% to_drop$ags_9_after.y)) %>%
    filter(!(ags_9_before=="150860140" & ags_9_after=="150860040"))
  
  
  return(gebietsaenderung)
}
gebietsaenderung=load_gebietsaenderung()

# Arbeitsmarkt data 

load_arbeitsmarkt_LK_data=function(){
  
  # path to files
  path_to_arbeitsmarkt_LK="./Daten/Arbeitsmarkt_landkreis/"
  arbeitsmarkt_LK_file_list <- list.files(path=path_to_arbeitsmarkt_LK)
  
  arbeitsmarkt_LK_data=data.frame()
  
  for(current_file_iterator in 1:length(arbeitsmarkt_LK_file_list)){
    
    current_file=arbeitsmarkt_LK_file_list[current_file_iterator]
    
    # skip undesired files
    if(substr(current_file,start=1, stop = 8)!="bst-reg-"){next}
    
    LK_import=suppressMessages(read_excel(path = paste0(path_to_arbeitsmarkt_LK,current_file),
                                          sheet = "Tabelle 2.3",trim_ws = T))
    
    colnames(LK_import) = paste0("import_column_",1:ncol(LK_import))
    
    data_row=which(LK_import$import_column_1=="Insgesamt")
    name_row=which(substr(LK_import$import_column_1,1,8)=="Stichtag")
    length_name_row=str_length(LK_import$import_column_1[name_row])
    
    arbeitsmarkt_LK_data[current_file_iterator,1:19]=c(
      strsplit(LK_import$import_column_1[name_row-1], '[()]')[[1]][length(strsplit(LK_import$import_column_1[name_row-1], '[()]')[[1]])-1],
      trimws(substr(LK_import$import_column_1[name_row],length_name_row-4,length_name_row)),
      as.numeric(LK_import$import_column_4[data_row+c(1:3,7:20)]))
  }
  arbeitsmarkt_LK_data=arbeitsmarkt_LK_data %>% filter_all(any_vars(!is.na(.))) 
  
  colnames(arbeitsmarkt_LK_data)=c(
    "ags_5",
    "year",
    "WZ__A",
    "WZ__B_D_E",
    "WZ__C",
    "WZ__F",
    "WZ__G",
    "WZ__H",
    "WZ__I",
    "WZ__J",
    "WZ__K",
    "WZ__L_M",
    "WZ__N_1",
    "WZ__N_2",
    "WZ__O_U",
    "WZ__P",
    "WZ__86",
    "WZ__87",
    "WZ__R_S_T")
  
  arbeitsmarkt_LK_data = arbeitsmarkt_LK_data %>% 
    mutate_at(2:19, as.numeric)
  
  arbeitsmarkt_LK_data=arbeitsmarkt_LK_data %>%
    mutate(WZ__N=WZ__N_1+WZ__N_2,
           WZ__Q=WZ__86+WZ__87)
  
  return(arbeitsmarkt_LK_data %>% ungroup())
}

arbeitsmarkt_LK_data=load_arbeitsmarkt_LK_data()

load_arbeitsmarkt_data=function(ags_book, gebietsaenderung){
  
  # path to files
  path_to_arbeitsmarkt="./Daten/Arbeitsmarkt_kommunal/"
  arbeitsmarkt_file_list <- list.files(path=path_to_arbeitsmarkt)
  
  years_and_columns=data.frame("Year"=2016:2020, "Year_column"=3:7)
  
  arbeitsmarkt_data=data.frame()
  
  for(current_file_iterator in 1:length(arbeitsmarkt_file_list)){
    # skip undesired files
    current_file=arbeitsmarkt_file_list[current_file_iterator]
    if(substr(current_file,start=1, stop = 21)!="Arbeitsmarkt-kommunal"){next}
    
    municipality_import=suppressMessages(read_excel(path = paste0(path_to_arbeitsmarkt,current_file),
                                                    sheet = "Daten",trim_ws = T, col_names = paste0("import_column_",1:7)))
    
    start_row=which(municipality_import$import_column_1=="2020")-1
    
    for(year_iterator in 1:nrow(years_and_columns)){
      
      current_year=years_and_columns$Year[year_iterator]
      current_column=years_and_columns$Year_column[year_iterator]
      
      current_municipality=colsplit(strsplit(x = municipality_import$import_column_1[start_row],split = ',')[[1]][1]," ",c("ags","municipality"))
      # replace, because col_split does convert ags to numeric automatically 
      current_municipality$ags=strsplit(strsplit(x = municipality_import$import_column_1[start_row],split = ',')[[1]][1],' ')[[1]][1]
      
      current_municipality$year=current_year
      current_municipality[,c("empl_wp_total",
                              "empl_wp_share_male",
                              "empl_wp_share_female",
                              "empl_wp_share_foreign",
                              "empl_wp_share_u25",
                              "empl_wp_share_o55",
                              "empl_wp_share_commute_in")]=
        suppressWarnings(as.numeric(pull(municipality_import[start_row+c(6:12
                                                                         #,14:17
        ),current_column])))
      # (DEPRECATED)
      # "ft_empl_wp_share_WZ_A",
      # "ft_empl_wp_share_WZ_BF",
      # "ft_empl_wp_share_WZ_GI",
      # "ft_empl_wp_share_WZ_JU")
      
      
      current_municipality[,c("empl_res_total",
                              "empl_res_share_male",
                              "empl_res_share_female",
                              "empl_res_share_foreign",
                              "empl_res_share_u25",
                              "empl_res_share_o55",
                              "empl_res_share_commute_out")]=
        suppressWarnings(as.numeric(pull(municipality_import[start_row+19:25,current_column])))
      
      current_municipality[,c("minijob_wp_total",
                              "minijob_wp_share_male",
                              "minijob_wp_share_female",
                              "minijob_wp_share_foreign",
                              "minijob_wp_share_2nd_job")]=
        suppressWarnings(as.numeric(pull(municipality_import[start_row+c(27:30,32),current_column])))
      
      current_municipality[,c("unempl_res_total",
                              "unempl_res_share_male",
                              "unempl_res_share_female",
                              "unempl_res_share_foreign",
                              "unempl_res_share_u25",
                              "unempl_res_share_o55",
                              "unempl_res_share_long_time",
                              "unempl_res_share_SGBIII",
                              "unempl_res_share_SGBII")]=suppressWarnings(as.numeric(pull(municipality_import[start_row+34:42,current_column])))
      
      arbeitsmarkt_data=bind_rows(arbeitsmarkt_data, current_municipality)
    }
  }
  
  arbeitsmarkt_data=left_join(left_join(arbeitsmarkt_data %>% filter(str_length(ags)==8),
                                        gebietsaenderung %>% 
                                          filter(change_year>=current_year) %>%
                                          select(ags_8_before, ags_9_after, multiplier) %>% 
                                          filter(!is.na(ags_8_before)),
                                        by=c("ags"="ags_8_before")),
                              ags_book %>% select(ags_8, ags_9),
                              by=c("ags"="ags_8")) %>% 
    mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
           multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
    select(-municipality, -ags_9_after, -ags) %>%
    group_by(ags_9, year) %>% 
    replace(is.na(.), 0) %>%
    mutate(across(starts_with("empl_wp_share"),~(.x*multiplier) / sum(empl_wp_total*multiplier, na.rm = T)),
           across(starts_with("empl_res_share"),~(.x*multiplier) / sum(empl_res_total*multiplier, na.rm = T)),
           across(starts_with("minijob_wp_share"),~(.x*multiplier) / sum(minijob_wp_total*multiplier, na.rm = T)),
           across(starts_with("unempl_res_share"),~(.x*multiplier) / sum(unempl_res_total*multiplier, na.rm = T))) %>%
    ungroup() %>% group_by(ags_9,year) %>%
    summarise_all(sum, na.rm=T) %>% 
    select(-multiplier) %>% ungroup() %>% 
    mutate(unempl_div_empl_res=unempl_res_total/empl_res_total)
  
  return(arbeitsmarkt_data)
}

arbeitsmarkt_data=load_arbeitsmarkt_data(ags_book=ags_book, 
                                         gebietsaenderung=gebietsaenderung)

# Gemeindeverzeichnis

load_gvz_data=function(ags_book, gebietsaenderung){
  
  path_to_gemeindeverzeichnis="./Daten/Gemeindeverzeichnis/"
  
  years_and_files=data.frame("year"=2016:2020,
                             "path_to_file"=c("31122016_Auszug_GV.xlsx",
                                              "31122017_Auszug_GV.xlsx",
                                              "31122018_Auszug_GV.xlsx",
                                              "31122019_Auszug_GV.xlsx",
                                              "AuszugGV4QAktuell.xlsx"),
                             "worksheet"=c("Onlineprodukt_Gemeinden_311216",
                                           "Onlineprodukt_Gemeinden_311217",
                                           "Onlineprodukt_Gemeinden_311218",
                                           "Onlineprodukt_Gemeinden_311219",
                                           "Onlineprodukt_Gemeinden"))
  
  gvz_data=data.frame()
  for(year_iterator in 1:nrow(years_and_files)){
    current_year=years_and_files$year[year_iterator]
    current_file=years_and_files$path_to_file[year_iterator]
    current_worksheet=years_and_files$worksheet[year_iterator]
    
    gvz_import=read_excel(path = paste0(path_to_gemeindeverzeichnis,current_file),
                          sheet = current_worksheet,trim_ws = T, col_names = paste0("import_column_",1:20))
    
    gvz_import_cln= gvz_import %>% mutate(ags_9=paste0(import_column_3, import_column_4, import_column_5, import_column_6),
                                          ags_12=paste0(import_column_3, import_column_4, import_column_5, import_column_6, import_column_7))
    
    # join gebietsänderung information
    gvz_import_cln=left_join(left_join(gvz_import_cln,
                                       gebietsaenderung %>% 
                                         filter(change_year>=current_year) %>%
                                         select(ags_12_before, ags_9_after, multiplier) %>% 
                                         filter(!is.na(ags_12_before)),
                                       by=c("ags_12"="ags_12_before")) %>%
                               mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after)) %>%
                               select(-ags_9_after),
                             gebietsaenderung %>% 
                               filter(change_year>=current_year) %>%
                               filter(is.na(ags_12_before)) %>% 
                               select(ags_9_before, ags_9_after, multiplier),
                             by=c("ags_9"="ags_9_before")) %>%
      mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after),
             multiplier=ifelse(is.na(multiplier.x),ifelse(is.na(multiplier.y),1,multiplier.y),multiplier.x)) %>%
      select(-multiplier.x, -multiplier.y)
    
    # convert columns to numeric
    gvz_data=suppressWarnings(bind_rows(gvz_data,
                                        gvz_import_cln %>% 
                                          filter(as.numeric(import_column_1)>=60) %>%
                                          mutate(import_column_15=gsub(",",".",import_column_15),import_column_16=gsub(",",".",import_column_16)) %>% 
                                          mutate_at(vars(paste0("import_column_",9:16)),funs(as.numeric(.)*multiplier)) %>% 
                                          group_by(ags_9) %>% 
                                          summarise(area=sum(import_column_9, na.rm=T), 
                                                    population_total=sum(import_column_10, na.rm=T),
                                                    population_share_male=ifelse(population_total==0,0,sum(import_column_11, na.rm=T)/population_total),
                                                    population_share_female=ifelse(population_total==0,0,sum(import_column_12, na.rm=T)/population_total),
                                                    population_density=sum(import_column_10, na.rm=T)/area,
                                                    latitude=sum(import_column_9*import_column_16, na.rm=T)/area,
                                                    longitude=sum(import_column_9*import_column_15, na.rm=T)/area,
                                                    urbanization=min(as.numeric(import_column_19), na.rm=T),
                                                    .groups="drop") %>% 
                                          mutate(year=current_year,
                                                 population_density_bin=1+(population_density>quantile(population_density, 0.1))+
                                                   (population_density>quantile(population_density, 0.9))) %>% 
                                          select(ags_9, year, area, population_total, population_share_male, population_share_female, population_density, population_density_bin, latitude, longitude, urbanization)))
    
  }
  return(gvz_data)
}

gvz_data=load_gvz_data(ags_book=ags_book, 
                       gebietsaenderung=gebietsaenderung)

# Zensus 2011

load_zensus_data=function(ags_book, gebietsaenderung){
  path_to_zensus="./Daten/Zensus_2011/"
  
  zensus_pop_import=read_excel(path = paste0(path_to_zensus,"xlsx_Bevoelkerung.xlsx"),
                               sheet = "Bevölkerung",trim_ws = T, col_names = paste0("import_column_",1:223))
  
  zensus_pop_data= left_join(left_join(zensus_pop_import %>% 
                                         filter(import_column_8=="Gemeinde") %>%
                                         mutate(ags_9=paste0(import_column_2,import_column_3,import_column_4,import_column_5)) %>% 
                                         rename(ags_12=import_column_1),
                                       gebietsaenderung %>% 
                                         select(ags_12_before, ags_9_after, multiplier) %>% 
                                         filter(!is.na(ags_12_before)),
                                       by=c("ags_12"="ags_12_before")) %>%
                               mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after)) %>%
                               select(-ags_9_after),
                             gebietsaenderung %>% 
                               filter(is.na(ags_12_before)) %>% 
                               select(ags_9_before, ags_9_after, multiplier),
                             by=c("ags_9"="ags_9_before")) %>%
    mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after),
           multiplier=ifelse(is.na(multiplier.x),ifelse(is.na(multiplier.y),1,multiplier.y),multiplier.x)) %>%
    select(-multiplier.x, -multiplier.y) %>% 
    mutate_at(vars(paste0("import_column_",9:223)),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
    group_by(ags_9) %>%
    summarise(pop_total=sum(import_column_10, na.rm = T),
              pop_share_male=ifelse(pop_total==0,0,sum(import_column_11, na.rm = T)/pop_total),
              pop_share_male=ifelse(pop_total==0,0,sum(import_column_12, na.rm = T)/pop_total),
              pop_share_single=ifelse(pop_total==0,0,sum(import_column_16, na.rm = T)/pop_total),
              pop_share_married=ifelse(pop_total==0,0,sum(import_column_19, na.rm = T)/pop_total),
              pop_share_widowed=ifelse(pop_total==0,0,sum(import_column_22, na.rm = T)/pop_total),
              pop_share_divorced=ifelse(pop_total==0,0,sum(import_column_25, na.rm = T)/pop_total),
              pop_share_reg_partner=ifelse(pop_total==0,0,sum(import_column_28, na.rm = T)/pop_total),
              pop_share_age_0_9=ifelse(pop_total==0,0,sum(import_column_43, na.rm = T)/pop_total),
              pop_share_age_10_19=ifelse(pop_total==0,0,sum(import_column_46, na.rm = T)/pop_total),
              pop_share_age_20_29=ifelse(pop_total==0,0,sum(import_column_49, na.rm = T)/pop_total),
              pop_share_age_30_39=ifelse(pop_total==0,0,sum(import_column_52, na.rm = T)/pop_total),
              pop_share_age_40_49=ifelse(pop_total==0,0,sum(import_column_55, na.rm = T)/pop_total),
              pop_share_age_50_59=ifelse(pop_total==0,0,sum(import_column_58, na.rm = T)/pop_total),
              pop_share_age_60_69=ifelse(pop_total==0,0,sum(import_column_61, na.rm = T)/pop_total),
              pop_share_age_70_79=ifelse(pop_total==0,0,sum(import_column_64, na.rm = T)/pop_total),
              pop_share_age_80_plus=ifelse(pop_total==0,0,sum(import_column_67, na.rm = T)/pop_total),
              pop_share_citizen_germany=ifelse(pop_total==0,0,sum(import_column_107, na.rm = T)/pop_total),
              pop_share_citizen_foreign=ifelse(pop_total==0,0,sum(import_column_108, na.rm = T)/pop_total),
              pop_share_citizen_EU27=ifelse(pop_total==0,0,sum(import_column_109, na.rm = T)/pop_total),
              pop_share_citizen_other_EU=ifelse(pop_total==0,0,sum(import_column_110, na.rm = T)/pop_total),
              pop_share_citizen_non_EU=ifelse(pop_total==0,0,sum(import_column_111,import_column_112, na.rm = T)/pop_total),
              pop_share_native_germany=ifelse(pop_total==0,0,sum(import_column_114, na.rm = T)/pop_total),
              pop_share_native_foreign=ifelse(pop_total==0,0,sum(import_column_115, na.rm = T)/pop_total),
              pop_share_native_EU27=ifelse(pop_total==0,0,sum(import_column_116, na.rm = T)/pop_total),
              pop_share_native_other_EU=ifelse(pop_total==0,0,sum(import_column_117, na.rm = T)/pop_total),
              pop_share_native_non_EU=ifelse(pop_total==0,0,sum(import_column_118, import_column_119, na.rm = T)/pop_total),
              pop_share_catholic=ifelse(pop_total==0,0,sum(import_column_121, na.rm = T)/pop_total),
              pop_share_protestant=ifelse(pop_total==0,0,sum(import_column_122, na.rm = T)/pop_total),
              pop_share_other_religion=ifelse(pop_total==0,0,sum(import_column_123, na.rm = T)/pop_total),
              .groups="drop") %>% 
    select(-starts_with("import_column"))
  
  zensus_res_import=read_excel(path = paste0(path_to_zensus,"xlsx_GebaudeWohnungen.xlsx"),
                               sheet = "Gebäude und Wohnungen",trim_ws = T, col_names = paste0("import_column_",1:132))
  
  zensus_res_data= left_join(left_join(zensus_res_import %>% 
                                         filter(import_column_8=="Gemeinde") %>%
                                         mutate(ags_9=paste0(import_column_2,import_column_3,import_column_4,import_column_5)) %>% 
                                         rename(ags_12=import_column_1),
                                       gebietsaenderung %>% 
                                         select(ags_12_before, ags_9_after, multiplier) %>% 
                                         filter(!is.na(ags_12_before)),
                                       by=c("ags_12"="ags_12_before")) %>%
                               mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after)) %>%
                               select(-ags_9_after),
                             gebietsaenderung %>% 
                               filter(is.na(ags_12_before)) %>% 
                               select(ags_9_before, ags_9_after, multiplier),
                             by=c("ags_9"="ags_9_before")) %>%
    mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after),
           multiplier=ifelse(is.na(multiplier.x),ifelse(is.na(multiplier.y),1,multiplier.y),multiplier.x)) %>%
    select(-multiplier.x, -multiplier.y) %>% 
    mutate_at(vars(paste0("import_column_",9:132)),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
    group_by(ags_9) %>%
    summarise(res_fac_total=sum(import_column_9, na.rm=T),
              res_fac_share_res_build=ifelse(res_fac_total==0,0,sum(import_column_11, na.rm = T)/res_fac_total),
              res_fac_share_res_dorms=ifelse(res_fac_total==0,0,sum(import_column_12, na.rm = T)/res_fac_total),
              res_fac_share_other_build=ifelse(res_fac_total==0,0,sum(import_column_13, na.rm = T)/res_fac_total),
              res_fac_share_YOC_0_1918=ifelse(res_fac_total==0,0,sum(import_column_15, na.rm = T)/res_fac_total),
              res_fac_share_YOC_1919_1949=ifelse(res_fac_total==0,0,sum(import_column_16, na.rm = T)/res_fac_total),
              res_fac_share_YOC_1950_1959=ifelse(res_fac_total==0,0,sum(import_column_17, na.rm = T)/res_fac_total),
              res_fac_share_YOC_1960_1969=ifelse(res_fac_total==0,0,sum(import_column_18, na.rm = T)/res_fac_total),
              res_fac_share_YOC_1970_1979=ifelse(res_fac_total==0,0,sum(import_column_19, na.rm = T)/res_fac_total),
              res_fac_share_YOC_1980_1989=ifelse(res_fac_total==0,0,sum(import_column_20, na.rm = T)/res_fac_total),
              res_fac_share_YOC_1990_1999=ifelse(res_fac_total==0,0,sum(import_column_21, na.rm = T)/res_fac_total),
              res_fac_share_YOC_2000_2005=ifelse(res_fac_total==0,0,sum(import_column_22, na.rm = T)/res_fac_total),
              res_fac_share_YOC_2006_2011=ifelse(res_fac_total==0,0,sum(import_column_23, na.rm = T)/res_fac_total),
              res_fac_share_owner_assoc=ifelse(res_fac_total==0,0,sum(import_column_36, na.rm = T)/res_fac_total),
              res_fac_share_private_owned=ifelse(res_fac_total==0,0,sum(import_column_37, na.rm = T)/res_fac_total),
              res_fac_share_housing_coop=ifelse(res_fac_total==0,0,sum(import_column_38, na.rm = T)/res_fac_total),
              res_fac_share_commune=ifelse(res_fac_total==0,0,sum(import_column_39, na.rm = T)/res_fac_total),
              res_fac_share_private_assoc=ifelse(res_fac_total==0,0,sum(import_column_40, na.rm = T)/res_fac_total),
              res_fac_share_private_enterpr=ifelse(res_fac_total==0,0,sum(import_column_41, na.rm = T)/res_fac_total),
              res_fac_share_gov_owned=ifelse(res_fac_total==0,0,sum(import_column_42, na.rm = T)/res_fac_total),
              res_fac_share_NPO_owned=ifelse(res_fac_total==0,0,sum(import_column_43, na.rm = T)/res_fac_total),
              res_fac_share_district_heat=ifelse(res_fac_total==0,0,sum(import_column_45, na.rm = T)/res_fac_total),
              res_fac_share_floor_heat=ifelse(res_fac_total==0,0,sum(import_column_46, na.rm = T)/res_fac_total),
              res_fac_share_block_heat=ifelse(res_fac_total==0,0,sum(import_column_47, na.rm = T)/res_fac_total),
              res_fac_share_central_heat=ifelse(res_fac_total==0,0,sum(import_column_48, na.rm = T)/res_fac_total),
              res_fac_share_oven_heat=ifelse(res_fac_total==0,0,sum(import_column_49, na.rm = T)/res_fac_total),
              res_fac_share_no_heat=ifelse(res_fac_total==0,0,sum(import_column_50, na.rm = T)/res_fac_total),
              res_fac_share_1_apt=ifelse(res_fac_total==0,0,sum(import_column_52, na.rm = T)/res_fac_total),
              res_fac_share_2_apt=ifelse(res_fac_total==0,0,sum(import_column_53, na.rm = T)/res_fac_total),
              res_fac_share_3_6_apt=ifelse(res_fac_total==0,0,sum(import_column_54, na.rm = T)/res_fac_total),
              res_fac_share_7_12_apt=ifelse(res_fac_total==0,0,sum(import_column_55, na.rm = T)/res_fac_total),
              res_fac_share_13_more_apt=ifelse(res_fac_total==0,0,sum(import_column_56, na.rm = T)/res_fac_total),
              res_fac_share_detached_house=ifelse(res_fac_total==0,0,sum(import_column_58, na.rm = T)/res_fac_total),
              res_fac_share_duplex_house=ifelse(res_fac_total==0,0,sum(import_column_59, na.rm = T)/res_fac_total),
              res_fac_share_terraced_house=ifelse(res_fac_total==0,0,sum(import_column_60, na.rm = T)/res_fac_total),
              res_fac_share_other_house_type=ifelse(res_fac_total==0,0,sum(import_column_61, na.rm = T)/res_fac_total),
              apt_total=sum(import_column_77, na.rm = T),
              apt_YOC_share_0_1918=ifelse(apt_total==0,0,sum(import_column_78, na.rm = T)/apt_total),
              apt_YOC_share_1919_1948=ifelse(apt_total==0,0,sum(import_column_79, na.rm = T)/apt_total),
              apt_YOC_share_1949_1978=ifelse(apt_total==0,0,sum(import_column_80, na.rm = T)/apt_total),
              apt_YOC_share_1979_1986=ifelse(apt_total==0,0,sum(import_column_81, na.rm = T)/apt_total),
              apt_YOC_share_1987_1990=ifelse(apt_total==0,0,sum(import_column_82, na.rm = T)/apt_total),
              apt_YOC_share_1991_1995=ifelse(apt_total==0,0,sum(import_column_83, na.rm = T)/apt_total),
              apt_YOC_share_1996_2000=ifelse(apt_total==0,0,sum(import_column_84, na.rm = T)/apt_total),
              apt_YOC_share_2001_2004=ifelse(apt_total==0,0,sum(import_column_85, na.rm = T)/apt_total),
              apt_YOC_share_2005_2008=ifelse(apt_total==0,0,sum(import_column_86, na.rm = T)/apt_total),
              apt_YOC_share_2009_2011=ifelse(apt_total==0,0,sum(import_column_87, na.rm = T)/apt_total),
              apt_share_owner_assoc=ifelse(apt_total==0,0,sum(import_column_89, na.rm = T)/apt_total),
              apt_share_private_owned=ifelse(apt_total==0,0,sum(import_column_90, na.rm = T)/apt_total),
              apt_share_housing_coop=ifelse(apt_total==0,0,sum(import_column_91, na.rm = T)/apt_total),
              apt_share_commune=ifelse(apt_total==0,0,sum(import_column_92, na.rm = T)/apt_total),
              apt_share_private_assoc=ifelse(apt_total==0,0,sum(import_column_93, na.rm = T)/apt_total),
              apt_share_private_enterpr=ifelse(apt_total==0,0,sum(import_column_94, na.rm = T)/apt_total),
              apt_share_gov_owned=ifelse(apt_total==0,0,sum(import_column_95, na.rm = T)/apt_total),
              apt_share_NPO_owned=ifelse(apt_total==0,0,sum(import_column_96, na.rm = T)/apt_total),
              apt_share_district_heat=ifelse(apt_total==0,0,sum(import_column_98, na.rm = T)/apt_total),
              apt_share_floor_heat=ifelse(apt_total==0,0,sum(import_column_99, na.rm = T)/apt_total),
              apt_share_block_heat=ifelse(apt_total==0,0,sum(import_column_100, na.rm = T)/apt_total),
              apt_share_central_heat=ifelse(apt_total==0,0,sum(import_column_101, na.rm = T)/apt_total),
              apt_share_oven_heat=ifelse(apt_total==0,0,sum(import_column_102, na.rm = T)/apt_total),
              apt_share_no_heat=ifelse(apt_total==0,0,sum(import_column_103, na.rm = T)/apt_total),
              apt_share_owner_occupied=ifelse(apt_total==0,0,sum(import_column_105, na.rm = T)/apt_total),
              apt_share_rented=ifelse(apt_total==0,0,sum(import_column_106, na.rm = T)/apt_total),
              apt_share_holiday_home=ifelse(apt_total==0,0,sum(import_column_107, na.rm = T)/apt_total),
              apt_share_empty=ifelse(apt_total==0,0,sum(import_column_108, na.rm = T)/apt_total),
              apt_share_size_below_40=ifelse(apt_total==0,0,sum(import_column_110, na.rm = T)/apt_total),
              apt_share_size_40_59=ifelse(apt_total==0,0,sum(import_column_111, na.rm = T)/apt_total),
              apt_share_size_60_79=ifelse(apt_total==0,0,sum(import_column_112, na.rm = T)/apt_total),
              apt_share_size_80_99=ifelse(apt_total==0,0,sum(import_column_113, na.rm = T)/apt_total),
              apt_share_size_100_119=ifelse(apt_total==0,0,sum(import_column_114, na.rm = T)/apt_total),
              apt_share_size_120_139=ifelse(apt_total==0,0,sum(import_column_115, na.rm = T)/apt_total),
              apt_share_size_140_159=ifelse(apt_total==0,0,sum(import_column_116, na.rm = T)/apt_total),
              apt_share_size_160_179=ifelse(apt_total==0,0,sum(import_column_117, na.rm = T)/apt_total),
              apt_share_size_180_199=ifelse(apt_total==0,0,sum(import_column_118, na.rm = T)/apt_total),
              apt_share_size_over_200=ifelse(apt_total==0,0,sum(import_column_119, na.rm = T)/apt_total),
              apt_share_1_room=ifelse(apt_total==0,0,sum(import_column_121, na.rm = T)/apt_total),
              apt_share_2_rooms=ifelse(apt_total==0,0,sum(import_column_122, na.rm = T)/apt_total),
              apt_share_3_rooms=ifelse(apt_total==0,0,sum(import_column_123, na.rm = T)/apt_total),
              apt_share_4_rooms=ifelse(apt_total==0,0,sum(import_column_124, na.rm = T)/apt_total),
              apt_share_5_rooms=ifelse(apt_total==0,0,sum(import_column_125, na.rm = T)/apt_total),
              apt_share_6_rooms=ifelse(apt_total==0,0,sum(import_column_126, na.rm = T)/apt_total),
              apt_share_7_or_more_rooms=ifelse(apt_total==0,0,sum(import_column_127, na.rm = T)/apt_total),
              apt_share_detached_house=ifelse(apt_total==0,0,sum(import_column_129, na.rm = T)/apt_total),
              apt_share_duplex_house=ifelse(apt_total==0,0,sum(import_column_130, na.rm = T)/apt_total),
              apt_share_terraced_house=ifelse(apt_total==0,0,sum(import_column_131, na.rm = T)/apt_total),
              apt_share_other_house_type=ifelse(apt_total==0,0,sum(import_column_132, na.rm = T)/apt_total),
              .groups="drop") %>% 
    select(-starts_with("import_column"))
  
  zensus_hh_import=read_excel(path = paste0(path_to_zensus,"xlsx_HaushalteFamilien.xlsx"),
                              sheet = "Haushalte, Familien",trim_ws = T, col_names = paste0("import_column_",1:49))
  
  zensus_hh_data= left_join(left_join(zensus_hh_import %>% 
                                        filter(import_column_8=="Gemeinde") %>%
                                        mutate(ags_9=paste0(import_column_2,import_column_3,import_column_4,import_column_5)) %>% 
                                        rename(ags_12=import_column_1),
                                      gebietsaenderung %>% select(ags_12_before, ags_9_after, multiplier) %>% filter(!is.na(ags_12_before)),
                                      by=c("ags_12"="ags_12_before")) %>%
                              mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after)) %>%
                              select(-ags_9_after),
                            gebietsaenderung %>% 
                              filter(is.na(ags_12_before)) %>% 
                              select(ags_9_before, ags_9_after, multiplier),
                            by=c("ags_9"="ags_9_before")) %>%
    mutate(ags_9=ifelse(is.na(ags_9_after), ags_9, ags_9_after),
           multiplier=ifelse(is.na(multiplier.x),ifelse(is.na(multiplier.y),1,multiplier.y),multiplier.x)) %>%
    select(-multiplier.x, -multiplier.y) %>% 
    mutate_at(vars(paste0("import_column_",9:49)),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
    group_by(ags_9) %>%
    summarise(hh_total=sum(import_column_9, na.rm = T),
              hh_share_1_person=ifelse(hh_total==0,0,sum(import_column_10, na.rm = T)/hh_total),
              hh_share_dinks=ifelse(hh_total==0,0,sum(import_column_11, na.rm = T)/hh_total),
              hh_share_family=ifelse(hh_total==0,0,sum(import_column_12, na.rm = T)/hh_total),
              hh_share_single_parent=ifelse(hh_total==0,0,sum(import_column_13, na.rm = T)/hh_total),
              hh_share_shared_flat=ifelse(hh_total==0,0,sum(import_column_14, na.rm = T)/hh_total),
              hh_share_2_person=ifelse(hh_total==0,0,sum(import_column_25, na.rm = T)/hh_total),
              hh_share_3_person=ifelse(hh_total==0,0,sum(import_column_26, na.rm = T)/hh_total),
              hh_share_4_person=ifelse(hh_total==0,0,sum(import_column_27, na.rm = T)/hh_total),
              hh_share_5_person=ifelse(hh_total==0,0,sum(import_column_28, na.rm = T)/hh_total),
              hh_share_6_or_more_person=ifelse(hh_total==0,0,sum(import_column_29, na.rm = T)/hh_total),
              hh_share_only_elderly=ifelse(hh_total==0,0,sum(import_column_31, na.rm = T)/hh_total),
              hh_share_mixed_elderly_young=ifelse(hh_total==0,0,sum(import_column_32, na.rm = T)/hh_total),
              hh_share_only_young=ifelse(hh_total==0,0,sum(import_column_33, na.rm = T)/hh_total),
              .groups="drop") %>% 
    select(-starts_with("import_column"))
  
  zensus_data=inner_join(inner_join(zensus_pop_data, zensus_res_data, by=c("ags_9")), 
                         zensus_hh_data, by=c("ags_9"))
  
  return(zensus_data)
}

zensus_data=load_zensus_data(ags_book=ags_book, 
                             gebietsaenderung=gebietsaenderung)

# Regionalstatistik

load_rs_data=function(ags_book, gebietsaenderung){
  path_to_regionalstatistik="./Daten/Regionalstatistik/"
  
  # 12411-02-03-5 - Bevölkerung nach Geschlecht und Altersgruppen
  
  rs_12411_02_03_5_data=data.frame()
  for(current_year in 2016:2019){
    rs_12411_02_03_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"12411-02-03-5_",current_year,".xlsx"),
                                                        sheet = "12411-02-03-5",trim_ws = T))
    colnames(rs_12411_02_03_5_import)= paste0("import_column_",1:ncol(rs_12411_02_03_5_import))
    
    rs_12411_02_03_5_data_year=left_join(left_join(rs_12411_02_03_5_import %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12411_02_03_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12411_02_03_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(pop_share_age_below_3=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_3, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_3_5=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_4, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_6_9=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_5, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_10_14=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_6, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_15_17=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_7, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_18_19=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_8, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_20_24=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_9, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_25_29=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_10, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_30_34=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_11, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_35_39=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_12, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_40_44=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_13, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_45_49=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_14, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_50_54=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_15, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_55_59=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_16, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_60_64=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_17, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_65_74=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_18, na.rm=T)/sum(import_column_20, na.rm=T)),
                pop_share_age_above_75=ifelse(sum(import_column_20, na.rm=T)==0,0,sum(import_column_19, na.rm=T)/sum(import_column_20, na.rm=T)), 
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_12411_02_03_5_data=bind_rows(rs_12411_02_03_5_data,rs_12411_02_03_5_data_year)
  }
  
  
  # 12411-07-01-5 - Durchschnittsalter der Bevölkerung
  
  rs_12411_07_01_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"12411-07-01-5.xlsx"),
                                                      sheet = "12411-07-01-5",trim_ws = T))
  colnames(rs_12411_07_01_5_import)= paste0("import_column_",1:ncol(rs_12411_07_01_5_import)) 
  
  rs_12411_07_01_5_data=data.frame()
  for(current_year in 2016:2019){
    # get start and end row in file
    start_row=which(rs_12411_07_01_5_import$import_column_1==paste0("31.12.",current_year))+1
    end_row=min(which(rs_12411_07_01_5_import$import_column_1==paste0("31.12.",current_year-1))-1,nrow(rs_12411_07_01_5_import))
    
    # get the population weights from other file
    rs_12411_02_03_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"12411-02-03-5_",current_year,".xlsx"),
                                                        sheet = "12411-02-03-5",trim_ws = T))
    colnames(rs_12411_02_03_5_import)= paste0("import_column_",1:ncol(rs_12411_02_03_5_import))
    rs_12411_07_01_5_import_year=left_join(rs_12411_07_01_5_import,
                                           rs_12411_02_03_5_import %>% select(import_column_1, import_column_20), 
                                           by=c("import_column_1"="import_column_1")) %>%
      rename(import_column_6=import_column_20)
    
    rs_12411_07_01_5_data_year=left_join(left_join(rs_12411_07_01_5_import_year[start_row:end_row,] %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12411_07_01_5_import_year))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12411_07_01_5_import_year))),funs(suppressWarnings(as.numeric(.)))) %>% 
      group_by(ags_9) %>% 
      summarise(pop_avg_age=ifelse(sum(import_column_6, na.rm=T)==0,0,sum(import_column_3*import_column_6*multiplier, na.rm=T)/sum(import_column_6*multiplier, na.rm=T)), 
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_12411_07_01_5_data=bind_rows(rs_12411_07_01_5_data,rs_12411_07_01_5_data_year)
  }
  
  
  # 12613-91-01-5 - Gestorbene
  
  rs_12613_91_01_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"12613-91-01-5.xlsx"),
                                                      sheet = "12613-91-01-5",trim_ws = T))
  colnames(rs_12613_91_01_5_import)= paste0("import_column_",1:ncol(rs_12613_91_01_5_import)) 
  
  rs_12613_91_01_5_data=data.frame()
  for(current_year in 2016:2019){
    # get start and end row in file
    start_row=which(rs_12613_91_01_5_import$import_column_1==current_year)+1
    end_row=min(which(rs_12613_91_01_5_import$import_column_1==current_year-1)-1,nrow(rs_12613_91_01_5_import))
    
    # get the population weights from other file
    rs_12411_02_03_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"12411-02-03-5_",current_year,".xlsx"),
                                                        sheet = "12411-02-03-5",trim_ws = T))
    colnames(rs_12411_02_03_5_import)= paste0("import_column_",1:ncol(rs_12411_02_03_5_import))
    rs_12613_91_01_5_import_year=left_join(rs_12613_91_01_5_import,rs_12411_02_03_5_import %>% select(import_column_1, import_column_20), by=c("import_column_1"="import_column_1")) %>%
      rename(import_column_4=import_column_20)
    
    rs_12613_91_01_5_data_year=left_join(left_join(rs_12613_91_01_5_import_year[start_row:end_row,] %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12613_91_01_5_import_year))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12613_91_01_5_import_year))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(pop_share_death=ifelse(sum(import_column_4, na.rm=T)==0,0,sum(import_column_3, na.rm=T)/sum(import_column_4, na.rm=T)),
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_12613_91_01_5_data=bind_rows(rs_12613_91_01_5_data,rs_12613_91_01_5_data_year)
  }
  
  
  # 12711-91-01-5 - Fort- und Zu-Züge
  
  rs_12711_91_01_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"12711-91-01-5.xlsx"),
                                                      sheet = "12711-91-01-5",trim_ws = T))
  colnames(rs_12711_91_01_5_import)= paste0("import_column_",1:ncol(rs_12711_91_01_5_import)) 
  
  rs_12711_91_01_5_data=data.frame()
  for(current_year in 2016:2019){
    # get start and end row in file
    start_row=which(rs_12711_91_01_5_import$import_column_1==current_year)+1
    end_row=min(which(rs_12711_91_01_5_import$import_column_1==current_year-1)-1,nrow(rs_12711_91_01_5_import))
    
    # get the population weights from other file
    rs_12411_02_03_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"12411-02-03-5_",current_year,".xlsx"),
                                                        sheet = "12411-02-03-5",trim_ws = T))
    colnames(rs_12411_02_03_5_import)= paste0("import_column_",1:ncol(rs_12411_02_03_5_import))
    rs_12711_91_01_5_import_year=left_join(rs_12711_91_01_5_import,rs_12411_02_03_5_import %>% select(import_column_1, import_column_20), by=c("import_column_1"="import_column_1")) %>%
      rename(import_column_5=import_column_20)
    
    rs_12711_91_01_5_data_year=left_join(left_join(rs_12711_91_01_5_import_year[start_row:end_row,] %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12711_91_01_5_import_year))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_12711_91_01_5_import_year))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(pop_share_inflow=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_3, na.rm=T)/sum(import_column_5, na.rm=T)),
                pop_share_outflow=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_4, na.rm=T)/sum(import_column_5, na.rm=T)),
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_12711_91_01_5_data=bind_rows(rs_12711_91_01_5_data,rs_12711_91_01_5_data_year)
  }
  
  
  # 14111-01-03-5 - Vote shares
  
  rs_14111_01_03_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"14111-01-03-5.xlsx"),
                                                      sheet = "14111-01-03-5",trim_ws = T))
  colnames(rs_14111_01_03_5_import)= paste0("import_column_",1:ncol(rs_14111_01_03_5_import)) 
  
  rs_14111_01_03_5_data=left_join(left_join(rs_14111_01_03_5_import %>% 
                                              mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                              mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                            gebietsaenderung %>% 
                                              filter(change_year>=current_year) %>%
                                              select(ags_8_before, ags_9_after, multiplier) %>% 
                                              filter(!is.na(ags_8_before)),
                                            by=c("ags_8"="ags_8_before")),
                                  ags_book %>% select(ags_8, ags_9),
                                  by=c("ags_8"="ags_8")) %>% 
    mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
           multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
    select(-ags_9_after) %>%
    filter(!is.na(ags_9)) %>%
    mutate_at(vars(paste0("import_column_",3:ncol(rs_14111_01_03_5_import))),funs(gsub("[/-]","0",.))) %>% 
    mutate_at(vars(paste0("import_column_",3:ncol(rs_14111_01_03_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
    group_by(ags_9) %>% 
    summarise(vote_share=ifelse(sum(import_column_3, na.rm=T)==0,0,sum(import_column_5, na.rm=T)/sum(import_column_3, na.rm=T)),
              vote_share_party_CDU_CSU=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_6, na.rm=T)/sum(import_column_5, na.rm=T)),
              vote_share_party_SPD=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_7, na.rm=T)/sum(import_column_5, na.rm=T)),
              vote_share_party_GRUNE=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_8, na.rm=T)/sum(import_column_5, na.rm=T)),
              vote_share_party_FDP=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_9, na.rm=T)/sum(import_column_5, na.rm=T)),
              vote_share_party_LINKE=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_10, na.rm=T)/sum(import_column_5, na.rm=T)),
              vote_share_party_AFD=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_11, na.rm=T)/sum(import_column_5, na.rm=T)),
              vote_share_party_OTHER=ifelse(sum(import_column_5, na.rm=T)==0,0,sum(import_column_12, na.rm=T)/sum(import_column_5, na.rm=T)),
              .groups="drop") %>% 
    select(-starts_with("import_column"))
  
  
  # 31111-01-02-5 - Genehmigungen neuer Wohnungen
  
  rs_31111_01_02_5_data=data.frame()
  for(current_year in 2016:2019){
    rs_31111_01_02_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"31111-01-02-5_",current_year,".xlsx"),
                                                        sheet = "31111-01-02-5",trim_ws = T))
    colnames(rs_31111_01_02_5_import)= paste0("import_column_",1:ncol(rs_31111_01_02_5_import))
    
    rs_31111_01_02_5_data_year=left_join(left_join(rs_31111_01_02_5_import %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31111_01_02_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31111_01_02_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(appr_res_fac_total=sum(import_column_3, na.rm=T),
                appr_res_fac_share_1_apt=ifelse(appr_res_fac_total==0,0,sum(import_column_4, na.rm=T)/appr_res_fac_total),
                appr_res_fac_share_2_apt=ifelse(appr_res_fac_total==0,0,sum(import_column_5, na.rm=T)/appr_res_fac_total),
                appr_res_fac_share_3_more_apt=ifelse(appr_res_fac_total==0,0,sum(import_column_6, na.rm=T)/appr_res_fac_total),
                appr_apt_total=sum(import_column_7, na.rm=T),
                appr_apt_total_area=sum(import_column_10, na.rm=T)*1000,
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_31111_01_02_5_data=bind_rows(rs_31111_01_02_5_data,rs_31111_01_02_5_data_year)
  }
  
  
  # 31111-02-02-5 - Genehmigungen neuer Nichtwohngebäude
  
  rs_31111_02_02_5_data=data.frame()
  for(current_year in 2016:2019){
    rs_31111_02_02_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"31111-02-02-5_",current_year,".xlsx"),
                                                        sheet = "31111-02-02-5",trim_ws = T))
    colnames(rs_31111_02_02_5_import)= paste0("import_column_",1:ncol(rs_31111_02_02_5_import))
    
    rs_31111_02_02_5_data_year=left_join(left_join(rs_31111_02_02_5_import %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31111_02_02_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31111_02_02_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(appr_non_res_fac_total=sum(import_column_3, na.rm=T),
                appr_non_res_fac_area=sum(import_column_4, na.rm=T)*1000,
                appr_non_res_fac_apt=sum(import_column_5, na.rm=T),
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_31111_02_02_5_data=bind_rows(rs_31111_02_02_5_data,rs_31111_02_02_5_data_year)
  }
  
  
  # 31111-03-02-5 - Genehmigungen aller Wohnungen
  
  rs_31111_03_02_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"31111-03-02-5.xlsx"),
                                                      sheet = "31111-03-02-5",trim_ws = T))
  colnames(rs_31111_03_02_5_import)= paste0("import_column_",1:ncol(rs_31111_03_02_5_import)) 
  
  rs_31111_03_02_5_data=data.frame()
  for(current_year in 2016:2019){
    # get start and end row in file
    start_row=which(rs_31111_03_02_5_import$import_column_1==current_year)+1
    end_row=min(which(rs_31111_03_02_5_import$import_column_1==current_year-1)-1,nrow(rs_31111_03_02_5_import))
    
    rs_31111_03_02_5_data_year=left_join(left_join(rs_31111_03_02_5_import[start_row:end_row,] %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31111_03_02_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31111_03_02_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(appr_apt_total_netto=sum(import_column_3, na.rm=T),
                appr_apt_total_changes=sum(abs(import_column_4), abs(import_column_5), abs(import_column_6), abs(import_column_7), na.rm=T),
                appr_apt_share_1_2_rooms=ifelse(appr_apt_total_changes==0,0,sum(import_column_4, na.rm=T)/appr_apt_total_changes),
                appr_apt_share_3_rooms=ifelse(appr_apt_total_changes==0,0,sum(import_column_5, na.rm=T)/appr_apt_total_changes),
                appr_apt_share_4_rooms=ifelse(appr_apt_total_changes==0,0,sum(import_column_6, na.rm=T)/appr_apt_total_changes),
                appr_apt_share_5_more_rooms=ifelse(appr_apt_total_changes==0,0,sum(import_column_7, na.rm=T)/appr_apt_total_changes),
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column")) %>% 
      mutate()
    
    rs_31111_03_02_5_data=bind_rows(rs_31111_03_02_5_data,rs_31111_03_02_5_data_year)
  }
  
  
  # 31121-01-02-5 - Fertigstellung neuer Wohnungen
  
  rs_31121_01_02_5_data=data.frame()
  for(current_year in 2016:2019){
    rs_31121_01_02_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"31121-01-02-5_",current_year,".xlsx"),
                                                        sheet = "31121-01-02-5",trim_ws = T))
    colnames(rs_31121_01_02_5_import)= paste0("import_column_",1:ncol(rs_31121_01_02_5_import))
    
    rs_31121_01_02_5_data_year=left_join(left_join(rs_31121_01_02_5_import %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31121_01_02_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31121_01_02_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(new_res_fac_total=sum(import_column_3, na.rm=T),
                new_res_fac_share_1_apt=ifelse(new_res_fac_total==0,0,sum(import_column_4, na.rm=T)/new_res_fac_total),
                new_res_fac_share_2_apt=ifelse(new_res_fac_total==0,0,sum(import_column_5, na.rm=T)/new_res_fac_total),
                new_res_fac_share_3_more_apt=ifelse(new_res_fac_total==0,0,sum(import_column_6, na.rm=T)/new_res_fac_total),
                new_apt_total=sum(import_column_7, na.rm=T),
                new_apt_total_area=sum(import_column_10, na.rm=T)*1000,
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_31121_01_02_5_data=bind_rows(rs_31121_01_02_5_data,rs_31121_01_02_5_data_year)
  }
  
  
  # 31121-02-02-5 - Fertigstellung neuer Nichtwohngebäude
  
  rs_31121_02_02_5_data=data.frame()
  for(current_year in 2016:2019){
    rs_31121_02_02_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"31121-02-02-5_",current_year,".xlsx"),
                                                        sheet = "31121-02-02-5",trim_ws = T))
    colnames(rs_31121_02_02_5_import)= paste0("import_column_",1:ncol(rs_31121_02_02_5_import))
    
    rs_31121_02_02_5_data_year=left_join(left_join(rs_31121_02_02_5_import %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31121_02_02_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31121_02_02_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(new_non_res_fac_total=sum(import_column_3, na.rm=T),
                new_non_res_fac_area=sum(import_column_4, na.rm=T)*1000,
                new_non_res_fac_apt=sum(import_column_5, na.rm=T),
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column"))
    
    rs_31121_02_02_5_data=bind_rows(rs_31121_02_02_5_data,rs_31121_02_02_5_data_year)
  }
  
  
  # 31121-03-02-5 - Fertigstellung aller Wohnungen
  
  rs_31121_03_02_5_data=data.frame()
  for(current_year in 2016:2019){
    rs_31121_03_02_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"31121-03-02-5_",current_year,".xlsx"),
                                                        sheet = "31121-03-02-5",trim_ws = T))
    colnames(rs_31121_03_02_5_import)= paste0("import_column_",1:ncol(rs_31121_03_02_5_import))
    
    rs_31121_03_02_5_data_year=left_join(left_join(rs_31121_03_02_5_import %>% 
                                                     mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                     mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                                   gebietsaenderung %>% 
                                                     filter(change_year>=current_year) %>%
                                                     select(ags_8_before, ags_9_after, multiplier) %>% 
                                                     filter(!is.na(ags_8_before)),
                                                   by=c("ags_8"="ags_8_before")),
                                         ags_book %>% select(ags_8, ags_9),
                                         by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31121_03_02_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_31121_03_02_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(new_apt_total_netto=sum(import_column_3, na.rm=T),
                new_apt_total_changes=sum(abs(import_column_4), abs(import_column_5), abs(import_column_6), abs(import_column_7), na.rm=T),
                new_apt_share_1_2_rooms=ifelse(new_apt_total_changes==0,0,sum(import_column_4, na.rm=T)/new_apt_total_changes),
                new_apt_share_3_rooms=ifelse(new_apt_total_changes==0,0,sum(import_column_5, na.rm=T)/new_apt_total_changes),
                new_apt_share_4_rooms=ifelse(new_apt_total_changes==0,0,sum(import_column_6, na.rm=T)/new_apt_total_changes),
                new_apt_share_5_more_rooms=ifelse(new_apt_total_changes==0,0,sum(import_column_7, na.rm=T)/new_apt_total_changes),
                .groups="drop") %>% 
      mutate(year=current_year) %>% 
      select(-starts_with("import_column")) %>% 
      mutate()
    
    rs_31121_03_02_5_data=bind_rows(rs_31121_03_02_5_data,rs_31121_03_02_5_data_year)
  }
  
  
  # AI016-2-5 - Verdienste und einkommen
  
  rs_AI016_2_5_import=suppressMessages(read_excel(path = paste0(path_to_regionalstatistik,"AI016-2-5.xlsx"),
                                                  sheet = "AI016-2-5",trim_ws = T))
  colnames(rs_AI016_2_5_import)= paste0("import_column_",1:ncol(rs_AI016_2_5_import)) 
  
  rs_AI016_2_5_data=data.frame()
  for(current_year in 2016:2016){
    # get start and end row in file
    start_row=which(rs_AI016_2_5_import$import_column_1==current_year)+1
    end_row=min(which(rs_AI016_2_5_import$import_column_1==current_year-1)-1,nrow(rs_AI016_2_5_import))
    
    rs_AI016_2_5_data_year=left_join(left_join(rs_AI016_2_5_import[start_row:end_row,] %>% 
                                                 mutate(ags_8=ifelse(str_length(import_column_1)==5,paste0(import_column_1,"000"),import_column_1)) %>%
                                                 mutate(ags_8=ifelse(ags_8 %in% c("02","11"),paste0(ags_8,"000000"),ags_8)),
                                               gebietsaenderung %>% 
                                                 filter(change_year>=current_year) %>%
                                                 select(ags_8_before, ags_9_after, multiplier) %>% 
                                                 filter(!is.na(ags_8_before)),
                                               by=c("ags_8"="ags_8_before")),
                                     ags_book %>% select(ags_8, ags_9, Population),
                                     by=c("ags_8"="ags_8")) %>% 
      mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
      select(-ags_9_after) %>%
      filter(!is.na(ags_9)) %>%
      mutate_at(vars(paste0("import_column_",3:ncol(rs_AI016_2_5_import))),funs(gsub("[/-]","0",.))) %>% 
      mutate_at(vars(paste0("import_column_",3:ncol(rs_AI016_2_5_import))),funs(suppressWarnings(as.numeric(.)*multiplier))) %>% 
      group_by(ags_9) %>% 
      summarise(income_pp=sum(import_column_3*Population, na.rm=T)/max(1,sum(Population, na.rm=T))*1000,
                .groups="drop") %>% 
      #mutate(year=current_year) %>% 
      select(-starts_with("import_column")) %>% 
      mutate()
    
    rs_AI016_2_5_data=bind_rows(rs_AI016_2_5_data,rs_AI016_2_5_data_year)
  }
  
  rs_data=full_join(full_join(full_join(full_join(full_join(full_join(full_join(full_join(full_join(full_join(full_join(rs_12411_02_03_5_data,
                                                                                                                        rs_12411_07_01_5_data, by=c("ags_9","year")),
                                                                                                              rs_12613_91_01_5_data, by=c("ags_9","year")),
                                                                                                    rs_12711_91_01_5_data, by=c("ags_9","year")),
                                                                                          rs_14111_01_03_5_data, by=c("ags_9")),
                                                                                rs_31111_01_02_5_data, by=c("ags_9","year")),
                                                                      rs_31111_02_02_5_data, by=c("ags_9","year")),
                                                            rs_31111_03_02_5_data, by=c("ags_9","year")),
                                                  rs_31121_01_02_5_data, by=c("ags_9","year")),
                                        rs_31121_02_02_5_data, by=c("ags_9","year")),
                              rs_31121_03_02_5_data, by=c("ags_9","year")),
                    rs_AI016_2_5_data, by=c("ags_9"))
  
  return(rs_data)
}

rs_data=load_rs_data(ags_book=ags_book, 
                     gebietsaenderung=gebietsaenderung)

# WZ data

load_WZ_data=function(arbeitsmarkt_data, arbeitsmarkt_LK_data, ags_book, impute=T){
  
  path_to_WZ="./Daten/WZ_data/"
  file_name="314956_SvB_Gem_WZ08_edit.xlsx"
  years=c(2016, 2017, 2018, 2019)
  
  # Load all worksheets of Wz file 
  
  WZ_data=data.frame()
  for(current_BL in 1:16){
    WZ_BL_import=suppressMessages(read_excel(path = paste0(path_to_WZ,file_name),
                                             sheet = current_BL+1,trim_ws = T))
    WZ_BL_import=suppressMessages(bind_cols(WZ_BL_import[seq(from=11, to=floor((nrow(WZ_BL_import)-1-11)/4)*4+11, by=4),1][rep(1:floor((nrow(WZ_BL_import)-11)/4),each=length(years)),],
                                            rep(years, times=floor((nrow(WZ_BL_import)-11)/4)),
                                            WZ_BL_import[11:(nrow(WZ_BL_import)-1),3:24]))
    colnames(WZ_BL_import)= c("ags_9","year","WZ_total",
                              paste0("ft_empl_wp_share_WZ_",
                                     c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U")))
    WZ_BL_import = WZ_BL_import %>% mutate(ags_9=substr(ags_9,1,9))
    WZ_BL_import[,3:24]=suppressMessages(sapply(WZ_BL_import[,3:24],as.numeric))
    
    # If WZ_total is not reported take it from other source
    WZ_BL_import=left_join(WZ_BL_import, 
                           arbeitsmarkt_data %>% select(ags_9, year, empl_wp_total),
                           by=c("year","ags_9")) %>% 
      rowwise() %>% 
      mutate(WZ_total=max(1,rowSums(across(starts_with("ft_empl_wp_share_WZ_")), na.rm = T),ifelse(is.na(WZ_total), empl_wp_total, WZ_total))) %>% 
      select(-empl_wp_total)
    
    WZ_data=bind_rows(WZ_data, WZ_BL_import)
  }
  
  if(impute){
    old_NAs=Inf
    while(sum(is.na(WZ_data))<old_NAs){
      old_NAs=sum(is.na(WZ_data))
      
      # impute missing values (only one missing)
      replace_rows=which(rowSums(is.na(WZ_data[,4:24]))==1)
      replace_values=pmax(0,WZ_data$WZ_total-rowSums(WZ_data[,4:24], na.rm = T))
      for(current_col in 4:24){
        current_rows=replace_rows[is.na(WZ_data[replace_rows,current_col])]
        WZ_data[current_rows,current_col]=replace_values[current_rows]
      }
      
      WZ_data=left_join(left_join(WZ_data, 
                                  ags_book %>% select(ags_5, ags_9) %>% distinct(),
                                  by="ags_9"),
                        arbeitsmarkt_LK_data,
                        by = c("year", "ags_5")) %>% 
        rowwise() %>%
        mutate(Still_missing=WZ_total-rowSums(across(starts_with("ft_empl_wp_share_WZ_")), na.rm = T))
      
      for(current_WZ in c("A","C","F","G","H","I","J","K","N","P","Q")){
        WZ_data_current=WZ_data[,c("year","ags_5","Still_missing",paste0(c("ft_empl_wp_share_WZ_","WZ__"),current_WZ))]
        colnames(WZ_data_current)=c("year","ags_5","Still_missing","Gemeinde","LK")
        WZ_data_current=WZ_data_current %>% 
          mutate(NAs=is.na(Gemeinde)) %>% 
          group_by(ags_5, year, Still_missing) %>% 
          mutate(NAs=sum(NAs)) %>% 
          ungroup() %>% 
          mutate(to_distribute=max(min(LK-sum(Gemeinde, na.rm = T),Still_missing),0)) %>%
          mutate(Gemeinde=ifelse(is.na(Gemeinde) & !is.na(to_distribute) & NAs<=1,to_distribute,Gemeinde))
        WZ_data[,paste0("ft_empl_wp_share_WZ_",current_WZ)]=WZ_data_current$Gemeinde
      }
      WZ_data=WZ_data %>% select(-starts_with("WZ__"),-ags_5, -Still_missing)
    }
    
    # Get shares of WZ in Germany
    WZ_DE_import=suppressMessages(read_excel(path = "./Daten/Arbeitsmarkt_landkreis/13111-0003.xlsx",
                                             sheet ="13111-0003",trim_ws = T))
    WZ_DE_import=suppressMessages(bind_cols(WZ_DE_import[(WZ_DE_import[,3]=="Insgesamt") %in% TRUE,4:7][1:21,],
                                            paste0("WZ_",c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U"))))
    WZ_DE_import[,1:4]=sapply(WZ_DE_import[,1:4], as.numeric)
    
    colnames(WZ_DE_import)=c("y2016","y2017","y2018","y2019","WZ")
    WZ_DE_import=pivot_longer(data=WZ_DE_import, cols = c("y2016","y2017","y2018","y2019"),names_to="year", values_to="employees") %>% mutate(year=as.numeric(substr(year,2,5))) %>%
      group_by(year) %>% mutate(WZ_share=employees/sum(employees)) %>% ungroup() %>% select(-employees)
    WZ_DE_import=pivot_wider(WZ_DE_import, id_cols = "year",names_from = "WZ",values_from = "WZ_share")
    # impute missing values (multiple missing)
    for(current_row in 1:nrow(WZ_data)){
      WZ_data_row=WZ_data[current_row,]
      na_columns=which(is.na(WZ_data_row))
      if(length(na_columns)>=1){
        replace_values=WZ_DE_import[WZ_DE_import$year==WZ_data_row$year, na_columns-2]
        replace_values=replace_values/sum(replace_values)
        WZ_data[current_row,na_columns]=max(0,WZ_data_row$WZ_total-sum(WZ_data_row[,4:24], na.rm = T))*replace_values
      }
    } 
  }
  WZ_data=WZ_data %>% ungroup() %>% mutate(across(where(is.numeric),~replace_na(.,0)))
  WZ_data$WZ_total=rowSums(select(WZ_data, ft_empl_wp_share_WZ_A:ft_empl_wp_share_WZ_U))
  # Make shares
  WZ_data=WZ_data %>% mutate_at(vars(ft_empl_wp_share_WZ_A:ft_empl_wp_share_WZ_U),~./WZ_total)
  
  return(WZ_data)
}

WZ_data=load_WZ_data(arbeitsmarkt_data=arbeitsmarkt_data, arbeitsmarkt_LK_data, ags_book, impute=T)

# Broadband data

load_broadband_data=function(ags_book){
  
  breitband_import=read_excel("./Daten/Mobilfunk/2021_BBA_Gemeindedaten_Privat-Gewerbe.xls")
  
  breitband_import=breitband_import %>%
    mutate(gem=ifelse(lan=="11","11000000",gem)) %>% #Berlin
    mutate(gem=ifelse(lan=="02","02000000",gem)) %>% #Hamburg
    mutate(gem=ifelse(krs=="09162","09162000",gem)) %>% #München
    mutate(gem=ifelse(krs=="05315","05315000",gem)) %>% #Köln
    mutate(gem=ifelse(krs=="04011","04011000",gem)) #Bremen

  breitband_ags=left_join(left_join(left_join(breitband_import,
                                              gebietsaenderung %>% select(ags_8_before, ags_8_after, multiplier),
                                              by=c("gem"="ags_8_before"))  %>% 
                                      mutate(ags_8=ifelse(is.na(ags_8_after),gem,ags_8_after),
                                             multiplier=ifelse(is.na(multiplier),1,multiplier)) %>%
                                      select(ags_8, multiplier, starts_with("priv_alle")),
                                    ags_book %>% select(ags_9, ags_8) %>% unique(),
                                    by="ags_8"),
                          gvz_data %>% filter(year==2019) %>% select(ags_9, population_total, urbanization),
                          by="ags_9") %>% 
    filter(population_total>0) %>%
    mutate(population_total=multiplier*population_total) %>%
    group_by(ags_9) %>%
    summarise(across(starts_with(c("priv_alle","latitude","longitude","population_total")),~ weighted.mean(.,population_total))) %>%
    select(-population_total) %>%
    mutate(across(starts_with("priv",replace_na(0))))
  
  return(breitband_ags)
}

broadband_data=load_broadband_data(ags_book)

# Join all data frames together
municipality_data_full=left_join(left_join(left_join(left_join(left_join(left_join(gvz_data,
                                                                                   arbeitsmarkt_data,
                                                                                   by=c("ags_9","year")),
                                                                         zensus_data,
                                                                         by="ags_9"),
                                                               rs_data,
                                                               by=c("ags_9","year")),
                                                     WZ_data %>% select(-WZ_total),
                                                     by=c("ags_9","year")),
                                           ags_book %>% select(ags_9, ags_5, municipality_vb) %>% distinct(),
                                           by="ags_9"),
                                 broadband_data,
                                 by="ags_9") %>%
  relocate(ags_5, ags_9, municipality_vb) %>%
  mutate(across(starts_with("priv_alle"),~replace_na(.,0)))


# Discard Gemeindeverbände with low population size --------------------------------------------

municipality_data=municipality_data_full %>% filter(year==2019, population_total>=2500)

# Import price data from RE21 --------------------------------------------

load_RE21_data=function(ags_book, gebietsaenderung){
  path_to_RE21_data="./Daten/Price_data_RE21/"
  
  # Prices
  RE21_price_data_import=readRDS(paste0(path_to_RE21_data,"data_delivery_211215/ags_price_data.rds"))
  RE21_price_data_import=RE21_price_data_import %>%
    rename(ags_8=ags) %>%
    mutate(year=as.integer(substr(qy,1,4)),
           quarter=case_when((qy*100) %% 100 == 0 ~ "Q1",
                             (qy*100) %% 100 == 25 ~ "Q2",
                             (qy*100) %% 100 == 50 ~ "Q3",
                             (qy*100) %% 100 == 75 ~ "Q4",
                             TRUE ~"Q?")) %>%
    rename(c("rent_residential"="rent","sale_residential"="sale")) %>%
    select(-qy) %>%
    pivot_longer(cols = contains(c("sale","rent")),names_to = "type",values_to = "price") %>%
    separate(col=type, into=c("type","sector"),sep = "_",remove = T)
  
  RE21_price_data_import=RE21_price_data_import %>% 
    arrange(ags_8, sector, type, year, quarter) %>%
    mutate(ags_8=ifelse(ags_8<10000000,paste0("0",trimws(format(ags_8, scientific = FALSE))),
                        trimws(format(ags_8, scientific = FALSE)))) %>%
    mutate(RE21_price_id=paste(quarter,year,type,sector,sep="_"))

  # Features 
  RE21_features_import=readRDS(paste0(path_to_RE21_data,"data_delivery_211226/sdo_data_full.rds"))
  RE21_features_import=RE21_features_import %>% 
    pivot_longer(names_to = "id",
                 cols = contains(c("rent","sale")), 
                 values_to = "value") %>% 
    mutate(year=as.integer(substr(qy,1,4)),
           quarter=case_when((qy*100) %% 100 == 0 ~ "Q1",
                             (qy*100) %% 100 == 25 ~ "Q2",
                             (qy*100) %% 100 == 50 ~ "Q3",
                             (qy*100) %% 100 == 75 ~ "Q4",
                             TRUE ~"Q?")) %>%
    separate(col="id",
             into=c("type","sector","measure"),
             extra="merge") %>% 
    mutate(sector=str_replace(sector,"resid","residential")) %>% 
    select(-qy) %>% 
    pivot_wider(names_from = "measure",
                values_from="value") %>%
    rename("ags_8"="ags_recent") %>%
    mutate(ags_8=ifelse(ags_8<10000000,paste0("0",trimws(format(ags_8, scientific = FALSE))),
                        trimws(format(ags_8, scientific = FALSE)))) 
  
  # Join prices and features and the gebietsaenderung
  RE21_price_data=left_join(left_join(left_join(RE21_price_data_import, 
                                                RE21_features_import,
                                                by=c("ags_8","quarter","year","type","sector")),
                                      gebietsaenderung %>% 
                                        select(ags_8_before, ags_8_after, ags_9_after, multiplier) %>% 
                                        filter(!is.na(ags_8_before)),
                                      by=c("ags_8"="ags_8_before"))%>% 
                              mutate(ags_8=ifelse(is.na(ags_8_after),ags_8,ags_8_after)),
                            ags_book %>% select(ags_8, ags_9, Population, municipality_vb),
                            by=c("ags_8"="ags_8")) %>% 
    mutate(ags_9=ifelse(is.na(ags_9_after),ags_9,ags_9_after),
           multiplier=ifelse(is.na(multiplier),1,multiplier)) %>% 
    select(-ags_9_after, -ags_8_after) %>%
    group_by(ags_9, municipality_vb, RE21_price_id,year,quarter,type,sector) %>%
    summarise(price=sum(price*multiplier*Population, na.rm = T)/sum(multiplier*Population, na.rm = T),
              across(starts_with("min_"), ~ min(.x)),
              across(starts_with("max_"), ~ max(.x)),
              across(starts_with(c("mean_","sd_","house_share","price")), ~ ifelse(sum(.x, na.rm = T)==0,NA,sum(.x*multiplier*Population, na.rm = T))/
                       ifelse(sum(multiplier*Population, na.rm = T)==0,NA,sum(multiplier*Population, na.rm = T))),
              .groups="drop") %>%
    relocate(ags_9, municipality_vb, RE21_price_id) %>% 
    arrange(ags_9, year, quarter, type, sector)
  
  return(RE21_price_data %>% filter(!is.na(municipality_vb)))
}

RE21_price_data=load_RE21_data(ags_book=ags_book, 
                               gebietsaenderung=gebietsaenderung)

# Import Corona cases from RKI --------------------------------------------

# download latest file here:
# https://www.arcgis.com/home/item.html?id=f10774f1c63e40168479a1feb6c7ca74

load_covid_data=function(latest_file, municipality_data){
  
  RKI_file_load=read.csv(file=paste0("./Daten/CoronaCases/",latest_file), stringsAsFactors = F, encoding = "UTF-8")
  
  # Convert dates
  RKI_file= RKI_file_load %>% mutate(date=as.Date(Meldedatum), Year_Month=format(date, '%y_%m'))
  colnames(RKI_file)[1]=c("IdLandkreis")
  # For Berlin there is the cnesus data only for the full Landkreis
  RKI_file[RKI_file$IdLandkreis>= 11000 & RKI_file$IdLandkreis<12000,"IdLandkreis"]=11000
  
  # covert to ags_5
  RKI_file =RKI_file %>% mutate(ags_5=ifelse(IdLandkreis<10000,paste0("0",IdLandkreis),paste(IdLandkreis)))
  
  #Cases
  #Aggregate to the Monthly, Age level
  Cases_by_LK_Age_Month=RKI_file %>% filter(NeuerFall>=0) %>% 
    group_by(ags_5, Altersgruppe, Year_Month) %>% 
    summarise(Cases=sum(AnzahlFall), .groups="drop")
  Cases_by_LK_Age_Month=as.data.frame(
    left_join(full_join(Cases_by_LK_Age_Month %>% select(ags_5, Year_Month, Altersgruppe, Cases), 
                        crossing(RKI_file$ags_5, RKI_file$Year_Month, RKI_file$Altersgruppe), 
                        by = c("ags_5"="RKI_file$ags_5","Year_Month"="RKI_file$Year_Month","Altersgruppe"="RKI_file$Altersgruppe")),
              Cases_by_LK_Age_Month %>% 
                select(ags_5) %>% 
                distinct(), 
              by = c("ags_5"="ags_5")) %>% 
      select(ags_5, Year_Month, Altersgruppe, Cases)) %>% 
    arrange(ags_5, Year_Month, Altersgruppe)
  Cases_by_LK_Age_Month$Cases[is.na(Cases_by_LK_Age_Month$Cases)]=0
  
  # Aggregate to monthly level
  Cases_by_LK_Month=Cases_by_LK_Age_Month %>% 
    group_by(ags_5, Year_Month) %>% 
    summarise(Cases=sum(Cases), .groups="drop")
  
  # Merge the census information to generate case-share
  pop_LK=municipality_data %>% group_by(ags_5) %>% summarise(population=sum(population_total), .groups="drop")
  
  Cases_by_LK_Month=left_join(Cases_by_LK_Month, pop_LK, by=c("ags_5"))
  Cases_by_LK_Month= Cases_by_LK_Month %>% mutate(Cases_share=round(Cases/population,digits = 5)) %>%
    select(-population)
  
  
  #Death
  
  # Aggregate to the Monthly, Age level
  Death_by_LK_Age_Month=RKI_file %>% 
    filter(NeuerTodesfall>=0) %>% 
    group_by(ags_5, Altersgruppe, Year_Month) %>% 
    summarise(Death=sum(AnzahlTodesfall), .groups="drop")
  Death_by_LK_Age_Month=as.data.frame(
    left_join(full_join(Death_by_LK_Age_Month %>% select(ags_5, Year_Month, Altersgruppe, Death), 
                        crossing(RKI_file$ags_5, RKI_file$Year_Month, RKI_file$Altersgruppe), 
                        by = c("ags_5"="RKI_file$ags_5","Year_Month"="RKI_file$Year_Month","Altersgruppe"="RKI_file$Altersgruppe")),
              Cases_by_LK_Age_Month %>% select(ags_5) %>% 
                distinct(), 
              by = c("ags_5"="ags_5")) %>% 
      select(ags_5, Year_Month, Altersgruppe, Death)) %>% arrange(ags_5, Year_Month, Altersgruppe)
  Death_by_LK_Age_Month$Death[is.na(Death_by_LK_Age_Month$Death)]=0
  
  # Aggregate to monthly level
  Death_by_LK_Month=Death_by_LK_Age_Month %>% 
    group_by(ags_5, Year_Month) %>% 
    summarise(Death=sum(Death), .groups="drop")
  
  Death_by_LK_Month=left_join(Death_by_LK_Month, pop_LK, by=c("ags_5"))
  Death_by_LK_Month= Death_by_LK_Month %>% mutate(Death_share=round(Death/population,digits = 6)) %>% 
    select(-population)
  
  # Join cases and death together
  Cases_by_LK_Month=full_join(Cases_by_LK_Month,Death_by_LK_Month, by=c("ags_5", "Year_Month")) %>% 
    arrange(ags_5, Year_Month)
  Cases_by_LK_Age_Month=full_join(Cases_by_LK_Age_Month,Death_by_LK_Age_Month, by=c("ags_5", "Year_Month","Altersgruppe")) %>% 
    arrange(ags_5, Year_Month)
  
  # calculate cumulative cases and deaths
  Cumulative_cases_by_LK=Cases_by_LK_Month %>% 
    group_by(ags_5) %>% 
    mutate(Covid_cum_cases=cumsum(Cases),
           Covid_cum_cases_share=cumsum(Cases_share),
           Covid_cum_death=cumsum(Death),
           Covid_cum_death_share=cumsum(Death_share)) %>%
    ungroup()
  Cumulative_cases_by_LK=pivot_wider(data = Cumulative_cases_by_LK,
                                     id_cols=c("ags_5"),
                                     names_from="Year_Month",
                                     values_from = c("Covid_cum_cases","Covid_cum_cases_share","Covid_cum_death","Covid_cum_death_share"),
                                     names_glue = "{.value}_{Year_Month}",
                                     names_sort=T)
  
  return(list("Cases_by_LK_Month"=Cases_by_LK_Month,
              "Cases_by_LK_Age_Month"=Cases_by_LK_Age_Month,
              "Cumulative_cases_by_LK"=Cumulative_cases_by_LK))
}

covid_data=load_covid_data(latest_file="RKI_COVID19_21_09_08.csv", municipality_data = municipality_data_full)



# Generate Treatment index ------------------------------------------------

generate_treatment_index=function(municipality_data, ags_book){
  path_to_KA="./Daten/Kurzarbeit/"
  file_name="Kurzarbeit_by_Landkreis.xlsx"
  WZ_string=c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U")
  
  KA_data=data.frame()
  number_of_month=13
  for(current_BL in 1:16){
    KA_import=suppressMessages(read_excel(path = paste0(path_to_KA,file_name),
                                          sheet = current_BL,trim_ws = T))
    KA_current=data.frame("WZ"=paste0("WZ_",WZ_string))
    year_month=""
    for(i in 1:number_of_month){
      year_month=(i-1)%%12+1
      year_month=paste(20+floor((i-1)/12),ifelse(year_month<10,paste0(0,year_month),year_month),sep="_")
      KA_current=suppressWarnings(bind_cols(KA_current,
                                            data.frame("no_name"=as.numeric(unname(unlist((KA_import[8:28,3+number_of_month-i])))) %>% replace_na(0)/
                                                         pmax(unname(unlist(KA_import[8:28,8+number_of_month] %>% replace_na(list(1)))),1))))
      names(KA_current)[ncol(KA_current)] =paste0("KA_share_",year_month)
    }
    total_names=str_replace(colnames(KA_current)[-1],pattern = "share",replacement = "total_BL")
    KA_current=pivot_wider(KA_current, values_from =  "KA_share_20_01":paste0("KA_share_",year_month),names_from = "WZ")
    KA_current$BL=ifelse(current_BL<10,paste0("0",current_BL),paste(current_BL))
    KA_current[,total_names]=t(as.numeric(KA_import[7,paste0("...",(2+number_of_month):3)]))
    KA_data=bind_rows(KA_data, KA_current)
  }
  
  # Join to data
  municipality_data_TI=left_join(left_join(municipality_data, 
                                           ags_book %>% select(ags_9, ags_2) %>% distinct(), 
                                           by=c("ags_9")),
                                 KA_data, 
                                 by=c("ags_2"="BL"))
  
  # Import HOmeoffice capability
  path_to_HO="./Daten/Homeoffice/"
  file_name="wfh_nace2.csv"
  file_name_WZ="wz-heft-d-0-201912-xlsx.xlsx"
  file_dictionary="Excelliste-WZ-2008.xls"
  
  HO_import=suppressMessages(read_csv(file=paste0(path_to_HO,file_name), col_names = T))
  HO_WZ_import=suppressMessages(read_excel(path = paste0(path_to_HO,file_name_WZ),
                                           sheet = "SVB - Tabelle I",trim_ws = T, col_names=F ))
  HO_WZ_import=HO_WZ_import[which(regexpr(pattern='^[[:digit:]][[:digit:]] \\w',text = HO_WZ_import$...1)>0),1:2]
  colnames(HO_WZ_import)=c("NACR2","employees")
  HO_WZ_import$employees=as.numeric(HO_WZ_import$employees)
  
  HO_dict_import=suppressMessages(read_excel(path = paste0(path_to_HO,file_dictionary),
                                             sheet = "Abteilungen zu Abschnitt",trim_ws = T, col_names=F )) %>% na.omit()
  HO_dict_import=HO_dict_import[which(regexpr(pattern='^[A-U]',text = HO_dict_import$...1)>0),1:2]
  HO_dict=data.frame()
  for(i in 1:nrow(HO_dict_import)){
    lower=as.integer(substr(HO_dict_import[i,2],1,2))
    higher=ifelse(str_length(HO_dict_import[i,2])==2,lower,substr(HO_dict_import[i,2],4,5))
    HO_dict=bind_rows(HO_dict,data.frame(HO_dict_import[i,1],lower:higher))
  }
  colnames(HO_dict)=c("WZ","NACR2")
  HO_data=bind_cols(HO_dict, HO_import[-1,-1], HO_WZ_import[,-1])
  
  HO_data=HO_data %>%
    group_by(WZ) %>% 
    summarise(HO_freq=sum(wfh_freq*employees)/sum(employees),
              HO_occ=sum(wfh_tot*employees)/sum(employees),
              HO_untapped=sum(wfh_feas*employees)/sum(employees))
  
  
  #Generate Index
  
  for(i in 1:nrow(municipality_data_TI)){
    x=municipality_data_TI[i,]
    for(j in 1:number_of_month){
      year_month=(j-1)%%12+1
      year_month=paste(20+floor((j-1)/12),ifelse(year_month<10,paste0(0,year_month),year_month),sep="_")
      municipality_data_TI[i,paste0("KA_index_",year_month)]=sum(x[paste0("ft_empl_wp_share_WZ_",WZ_string)]*
                                                                   x[paste0("KA_share_",year_month,"_WZ_",WZ_string)], na.rm = T)
    }
    municipality_data_TI[i,"HO_freq_index"]=sum(x[paste0("ft_empl_wp_share_WZ_",WZ_string)]* HO_data$HO_freq/100, na.rm=T)
    municipality_data_TI[i,"HO_occ_index"]=sum(x[paste0("ft_empl_wp_share_WZ_",WZ_string)]* HO_data$HO_occ/100, na.rm=T)
    municipality_data_TI[i,"HO_untapped_index"]=sum(x[paste0("ft_empl_wp_share_WZ_",WZ_string)]* HO_data$HO_untapped/100, na.rm=T)
  }
  
  KA_BL_colnames=colnames(municipality_data_TI)[grepl("^KA_total_BL", colnames(municipality_data_TI))]
  KA_colnames=colnames(municipality_data_TI)[grepl("^KA_index", colnames(municipality_data_TI))]
  
  municipality_data_TI$KA_index=diag(as.matrix(municipality_data_TI[,KA_colnames]) %*% 
                                       t(as.matrix(municipality_data_TI[,KA_BL_colnames])))/rowSums(municipality_data_TI[,KA_BL_colnames])
  
  # generate the monthly cumulative averages
  KA_index_long=pivot_longer(municipality_data_TI[,c("ags_9",KA_colnames)], cols=all_of(KA_colnames)) %>% 
    arrange(ags_9, name)
  
  KA_index_long=KA_index_long %>% 
    group_by(ags_9) %>% 
    mutate(KA_index_avg=cummean(value)) %>%
    ungroup() %>% 
    mutate(name=str_replace(name, "KA_index","KA_index_avg"))
  
  KA_index_long=pivot_wider(data = KA_index_long,
                            id_cols=c("ags_9"),
                            names_from="name",
                            values_from = KA_index_avg,
                            names_sort=T)
  municipality_data_TI=left_join(municipality_data_TI,
                                 KA_index_long,
                                 by="ags_9")
  
  municipality_data_TI = municipality_data_TI %>% select(-starts_with("KA_share_"),-starts_with("KA_total_BL_"),-ags_2)
  return(municipality_data_TI)
}

municipality_data=generate_treatment_index(municipality_data, ags_book)

# Generate estimation data set --------------------------------------------

# merge covid cases to data
estimation_data=left_join(municipality_data,
                          covid_data$Cumulative_cases_by_LK,
                          by="ags_5")

# generate log of population and income
estimation_data$log_population_total=log(estimation_data$population_total, 10)
estimation_data$log_income_pp=log(estimation_data$income_pp, 10)

# generate economic sectors
estimation_data=estimation_data %>% mutate(
  ft_empl_wp_share_sector_primary=ft_empl_wp_share_WZ_A,
  ft_empl_wp_share_sector_secondary= rowSums(select(., .dots = ft_empl_wp_share_WZ_B:ft_empl_wp_share_WZ_F)),
  ft_empl_wp_share_sector_tertiary= rowSums(select(., .dots = ft_empl_wp_share_WZ_G:ft_empl_wp_share_WZ_U)),
  ft_empl_wp_share_sector_tertiary_I_R= rowSums(select(., .dots =c("ft_empl_wp_share_WZ_I","ft_empl_wp_share_WZ_R"))),
  ft_empl_wp_share_sector_tertiary_K= rowSums(select(., .dots =c("ft_empl_wp_share_WZ_K"))),
  ft_empl_wp_share_sector_tertiary_J_M_N= rowSums(select(., .dots =c("ft_empl_wp_share_WZ_J","ft_empl_wp_share_WZ_M","ft_empl_wp_share_WZ_N"))),
  ft_empl_wp_share_sector_tertiary_I_R= rowSums(select(., .dots =c("ft_empl_wp_share_WZ_I","ft_empl_wp_share_WZ_R"))),
  ft_empl_wp_share_sector_tertiary_G_H_L_O_P_Q_S_T_U= rowSums(select(., .dots = c("ft_empl_wp_share_WZ_G","ft_empl_wp_share_WZ_H",
                                                                                  "ft_empl_wp_share_WZ_L","ft_empl_wp_share_WZ_O",
                                                                                  "ft_empl_wp_share_WZ_P","ft_empl_wp_share_WZ_Q",
                                                                                  "ft_empl_wp_share_WZ_S","ft_empl_wp_share_WZ_T",
                                                                                  "ft_empl_wp_share_WZ_U"))))

# Generate avg apt variables
estimation_data=estimation_data %>% mutate(avg_rooms_per_apt=apt_share_1_room+
                                  apt_share_2_rooms*2+
                                  apt_share_3_rooms*3+
                                  apt_share_4_rooms*4+
                                  apt_share_5_rooms*5+
                                  apt_share_6_rooms*6+
                                  apt_share_7_or_more_rooms*7,
                                avg_apt_size=apt_share_size_below_40*40+
                                  apt_share_size_40_59*50+
                                  apt_share_size_60_79*70+
                                  apt_share_size_80_99*90+
                                  apt_share_size_100_119*110+
                                  apt_share_size_120_139*130+
                                  apt_share_size_140_159*150+
                                  apt_share_size_160_179*170+
                                  apt_share_size_180_199*190+
                                  apt_share_size_over_200*200,
                                avg_apt_per_res_fac=res_fac_share_1_apt+
                                  res_fac_share_2_apt*2+
                                  res_fac_share_3_6_apt*4.5+
                                  res_fac_share_7_12_apt*9.5+
                                  res_fac_share_13_more_apt*13,
                                avg_people_per_household=hh_share_1_person+
                                  hh_share_2_person*2+
                                  hh_share_3_person*3+
                                  hh_share_4_person*4+
                                  hh_share_5_person*5+
                                  hh_share_6_or_more_person*6)

# owner share variables
estimation_data=estimation_data %>%
  mutate(apt_share_owned_others=apt_share_commune+apt_share_housing_coop+apt_share_NPO_owned+apt_share_owner_assoc,
         apt_share_owned_gov=apt_share_gov_owned,
         apt_share_owned_private_invest=apt_share_private_assoc+apt_share_private_enterpr,
         apt_share_owned_private_person=apt_share_private_owned
         
)
# Vote share
estimation_data = estimation_data %>%
  mutate(vote_share_left=vote_share_party_LINKE,
         vote_share_mid=vote_share_party_CDU_CSU+vote_share_party_SPD+vote_share_party_GRUNE+vote_share_party_FDP,
         vote_share_right=vote_share_party_AFD
)

# one gemeindeverband is removed because no price is available. 4161 are left.

treatment_covid_variable="Covid_cum_cases_share_20_12"
treatment_KA_variable="KA_index_avg_20_12"

# generate treatment group based on variable definition
estimation_data$Covid_dimension=cut_number(estimation_data[,treatment_covid_variable],n=3, labels=c("low","medium","high"))
estimation_data$Covid_dimension_num=as.numeric(estimation_data$Covid_dimension)
estimation_data$KA_dimension=cut_number(estimation_data[,treatment_KA_variable],n=3, labels=c("low","medium","high"))
estimation_data$KA_dimension_num=as.numeric(estimation_data$KA_dimension)

estimation_data=estimation_data %>% 
  mutate(Treatment_group=paste0("Covid_",Covid_dimension,"_KA_",KA_dimension))

estimation_data$Treatment_group=factor(estimation_data$Treatment_group,
                                       levels=c("Covid_low_KA_low", "Covid_low_KA_medium" ,"Covid_low_KA_high","Covid_medium_KA_low","Covid_medium_KA_medium", "Covid_medium_KA_high","Covid_high_KA_low","Covid_high_KA_medium","Covid_high_KA_high"),
                                       ordered=T)
estimation_data$Treatment_group_num=as.numeric(estimation_data$Treatment_group)

write.csv(estimation_data %>% 
            filter(ags_9 %in% unique(RE21_price_data$ags_9),
                   ags_9!="033589501"), 
          file = "./Estimation_data/estimation_data_2022_07_07.csv",
          row.names=F)


RE21_price_data_estimation=RE21_price_data %>% 
  filter(ags_9 %in% unique(estimation_data$ags_9),
         ags_9!="033589501") %>%
  mutate(house_share = ifelse(sector=="residential" & is.na(house_share),0,house_share)) %>%
  mutate(across(where(is.numeric),~round(.,5)))
write.csv(RE21_price_data_estimation, 
          file = "./Estimation_data/RE21_data_2022_07_07.csv",
          row.names=F)

# Generate descriptive stats table ------------------------------
desc_stats=left_join(estimation_data[,c('ags_9',
                                        'Covid_dimension','KA_dimension',
                                        'population_total', 
                                        'apt_share_owned_gov', 'apt_share_owned_others',
                                        'apt_share_owned_private_invest',
                                        'apt_share_owned_private_person', 'apt_total',
                                        'new_apt_total', 'new_non_res_fac_total',
                                        'new_res_fac_total', 'avg_apt_per_res_fac', 'avg_apt_size',
                                        'avg_people_per_household', 'empl_res_share_commute_out',
                                        'empl_wp_share_commute_in', 'empl_res_share_foreign',
                                        'empl_res_share_o55', 'empl_res_total', 'empl_wp_total',
                                        'ft_empl_wp_share_sector_primary',
                                        'ft_empl_wp_share_sector_secondary',
                                        'ft_empl_wp_share_sector_tertiary_G_H_L_O_P_Q_S_T_U',
                                        'ft_empl_wp_share_sector_tertiary_I_R',
                                        'ft_empl_wp_share_sector_tertiary_J_M_N',
                                        'ft_empl_wp_share_sector_tertiary_K',
                                        'hh_share_1_person', 'hh_share_dinks', 'hh_share_family',
                                        'hh_total',
                                        'HO_occ_index', 'log_income_pp', 'pop_avg_age',
                                        'pop_share_age_0_9', 'pop_share_age_10_19',
                                        'pop_share_citizen_EU27', 'pop_share_citizen_germany',
                                        'pop_share_inflow', 'pop_share_outflow',
                                        'area',
                                        'res_fac_share_13_more_apt',
                                        'res_fac_share_detached_house', 'res_fac_share_res_build',
                                        'res_fac_total', 'unempl_div_empl_res', 'unempl_res_total',
                                        'urbanization', 'vote_share_left', 'vote_share_mid',
                                        'vote_share_right','priv_alle_tech_200')] %>% 
                       mutate(log_income_pp=ifelse(is.infinite(log_income_pp),NA,log_income_pp)),
                     pivot_wider(RE21_price_data_estimation %>% 
                                   filter((year==2019 & quarter=="Q4") | (year==2021 & quarter=="Q1")),
                                 id_cols="ags_9",
                                 names_from="RE21_price_id",
                                 values_from="price"),
                     by="ags_9")

desc_stats=pivot_longer(desc_stats,
                  cols=apt_share_owned_gov:Q1_2021_sale_retail,
                  names_to = "variable",
                  values_to = "value")


desc_stats_covid = desc_stats %>%
  group_by(Covid_dimension, variable) %>% 
  summarise(aver=as.numeric(format(weighted.mean(value, population_total, na.rm=T), digits=2)),
            stdd=as.numeric(format(sd(value, na.rm=T),digits=2)),
            aver_n=weighted.mean(value, population_total, na.rm=T),
            stdd_n=sd(value, na.rm=T),
            .groups="drop") %>% 
  group_by(variable) %>%
  mutate(digits=max(match(TRUE, round(aver, 0:10) == aver),match(TRUE, round(stdd, 0:10) == stdd))-1) %>%
  ungroup() %>%
  mutate(aver=sprintf(paste0("%.",digits,"f"), round(aver_n, digits)),
         stdd=sprintf(paste0("%.",digits,"f"), round(stdd_n, digits))) %>%
  mutate(avg_sd=paste0(aver," (",stdd,")"))

desc_stats_covid=pivot_wider(desc_stats_covid,
                             id_cols="variable",
                             names_from="Covid_dimension",
                             values_from="avg_sd") %>% 
  mutate(row_text=paste(low, medium, high, sep=" & ")) %>% 
  select(variable, row_text)


desc_stats_KA = desc_stats %>%
  group_by(KA_dimension, variable) %>% 
  summarise(aver=as.numeric(format(weighted.mean(value, population_total, na.rm=T), digits=2)),
            stdd=as.numeric(format(sd(value, na.rm=T),digits=2)),
            aver_n=weighted.mean(value, population_total, na.rm=T),
            stdd_n=sd(value, na.rm=T),
            .groups="drop") %>% 
  group_by(variable) %>%
  mutate(digits=max(match(TRUE, round(aver, 0:10) == aver),match(TRUE, round(stdd, 0:10) == stdd))-1) %>%
  ungroup() %>%
  mutate(aver=sprintf(paste0("%.",digits,"f"), round(aver_n, digits)),
         stdd=sprintf(paste0("%.",digits,"f"), round(stdd_n, digits))) %>%
  mutate(avg_sd=paste0(aver," (",stdd,")"))

desc_stats_KA=pivot_wider(desc_stats_KA,
                             id_cols="variable",
                             names_from="KA_dimension",
                             values_from="avg_sd") %>% 
  mutate(row_text=paste(low, medium, high, sep=" & ")) %>% 
  select(variable, row_text)

desc_stats_both=merge(desc_stats_covid, desc_stats_KA, by="variable") %>% 
  mutate(row_text=paste(row_text.x, row_text.y, sep=" && ")) %>% 
  select(variable, row_text)      

write.csv(desc_stats_both,"./Analysis/desc_stats_both.csv", row.names = F)   


# Analysis - Treatment index ----------------------------------------------


#Plot on the Home Office index
shapefile_municipality=left_join(shapefile,
                              municipality_data %>% select(ags_9, HO_freq_index, starts_with("KA_index_")),
                              by="ags_9")

map_HO_index=theme_map + 
  geom_sf(data = shapefile_municipality, aes(fill=HO_freq_index), linewidth = NA, color = "black") + 
  scale_fill_viridis_c(option="viridis",
                       limits=quantile(municipality_data$HO_freq_index, 
                                       probs = c(0.05, 0.95), na.rm = T),
                       oob = scales::squish, 
                       direction = -1)+
  labs(fill="Share")+
  geom_sf_text(data=shapefile_municipality, aes(label = ABR, fontface=2), colour = "white", size=4)
   

ggsave(plot = map_HO_index, filename = paste0("./Analysis/HO_index.",plot_type), 
       device = plot_type,units = "cm",width =10, height = 10, dpi = 300)


#Plot on the Kurzarbeit index

KA_year_month="20_12"
KA_time=paste0("KA_index_avg_", KA_year_month)
shapefile_municipality$KA_index=shapefile_municipality %>% pull(KA_time)
map_KA_index=theme_map + 
  geom_sf(data = shapefile_municipality, aes(fill=KA_index), linewidth = NA, color = "black") + 
  scale_fill_viridis_c(option="viridis",
                       limits=quantile(shapefile_municipality$KA_index, 
                                       probs = c(0.05, 0.95), na.rm = T),
                       oob = scales::squish, 
                       direction = -1)+
  labs(fill="Share")+
  geom_sf_text(data=shapefile_municipality, aes(label = ABR, fontface=2), colour = "white", size=4)

ggsave(plot = map_KA_index, filename = paste0("./Analysis/KA_index.",plot_type), 
       device = plot_type,units = "cm",width =10, height = 10, dpi = 300)


KA_months=str_replace(str_replace(colnames(shapefile_municipality)[grepl('^KA_index_2',colnames(shapefile_municipality))],"KA_index_",""),"_bin","")

KA_limits=quantile(pivot_longer(shapefile_municipality[,paste0("KA_index_",KA_months)],
                                cols = paste0("KA_index_",KA_months))$value, 
                   probs = c(0.01, 0.99), na.rm = T)
for(month in KA_months){
  shapefile_municipality$KA_month =pull(shapefile_municipality,paste0("KA_index_",month))
  
  map_KA_index_grid_month=theme_map + 
    geom_sf(data = shapefile_municipality, aes(fill=KA_month), linewidth = NA, color = "black") + 
    scale_fill_viridis_c(option="viridis",
                         limits=quantile(shapefile_municipality$KA_index, 
                                         probs = c(0.05, 0.95), na.rm = T),
                         oob = scales::squish, 
                         direction = -1)+
    labs(fill="Share")+
    geom_sf_text(data=shapefile_municipality, aes(label = ABR, fontface=2), colour = "white", size=4)
  ggsave(plot = map_KA_index_grid_month, filename = paste0("./Analysis/KA_index_grid_",month,".",plot_type), 
         device = plot_type,units = "cm",width =10, height = 10, dpi = 300)
  
}

rm(KA_months, KA_limits, KA_time, KA_year_month)
rm(map_HO_index, map_KA_index, map_KA_index_grid_month)

# Analysis - RE21 Prices --------------------------------------------------

#Calculate price change from Q4/19 to Q1/21
shapefile_RE21=left_join(shapefile,
                         left_join(RE21_price_data %>% filter(quarter=="Q1", year==2021),
                                   RE21_price_data %>% filter(quarter=="Q4", year==2019),
                                   by=c("ags_9","type", "sector")) %>% 
                           mutate(price_change=(price.x/price.y-1)*100,
                                  price_posttreat=price.x,
                                  price_pretreat=price.y) %>% 
                           select(ags_9, type, sector, price_change, price_posttreat, price_pretreat),
                         by="ags_9")

#Calculate price to rents ratio in Q4/19 
shapefile_RE21=left_join(shapefile_RE21,
                         left_join(RE21_price_data %>% filter(quarter=="Q4", year==2019, type=="sale"),
                                   RE21_price_data %>% filter(quarter=="Q4", year==2019, type=="rent"),
                                   by=c("ags_9", "sector")) %>% 
                           mutate(price_ratio=price.x/price.y/12,
                                  sale_pretreat=price.x,
                                  rent_pretreat=price.y) %>% 
                           select(ags_9, sector, price_ratio, sale_pretreat, rent_pretreat),
                         by=c("ags_9","sector"))

price_types=c("rent","sale")
price_sectors=c("residential","office","retail")

for(current_sector in price_sectors){
  for(current_type in price_types){
    current_shapefile=shapefile_RE21 %>% filter(type==!!current_type, sector==!!current_sector)

    #Plot on the RE21 pretreatment prices
    map_RE21_price=theme_map + 
      geom_sf(data = current_shapefile, aes(fill=price_pretreat), linewidth = NA, color = "black") + 
      scale_fill_viridis_c(option="viridis",
                           limits=quantile(current_shapefile$price_pretreat, 
                                           probs = c(0.05, 0.95), na.rm = T),
                           oob = scales::squish, 
                           direction = -1)+
      geom_sf_text(data=current_shapefile, aes(label = ABR, fontface=2), colour = "white", size=4)
    if(current_type=="rent"){
      map_RE21_price=map_RE21_price +
        labs(fill="Avg. m2\nrent")
    } else {
      map_RE21_price=map_RE21_price +
        labs(fill="Avg. m2\nprice")
    }
    ggsave(plot = map_RE21_price, filename = paste0("./Analysis/RE21_",current_sector,"_",current_type,"_Q4_2019.",plot_type),
           device = plot_type,units = "cm",width =10, height = 10, dpi = 300)
    
    
    # price change in binned version
    map_RE21_price_change=theme_map + 
      geom_sf(data = current_shapefile, aes(fill=price_change), linewidth = NA, color = "black") + 
      scale_fill_viridis_c(option="viridis",
                           limits=quantile(shapefile_RE21 %>% filter(type==!!current_type) %>% pull(price_change), 
                                           probs = c(0.05, 0.95), na.rm = T),
                           oob = scales::squish, 
                           direction = -1)+
      geom_sf_text(data=current_shapefile, aes(label = ABR, fontface=2), colour = "white", size=4)
    if(current_type=="rent"){
      map_RE21_price_change=map_RE21_price_change +
        labs(fill="Rent\nchange [%]")
    } else {
      map_RE21_price_change=map_RE21_price_change +
        labs(fill="Price\nchange [%]")
    }
    ggsave(plot = map_RE21_price_change, filename = paste0("./Analysis/RE21_",current_sector,"_",current_type,"_price_change.",plot_type), 
           device = plot_type,units = "cm",width =10, height = 10, dpi = 300)
  }
  
  # Ratio between sales price and annual rents
  map_RE21_price_ratio=theme_map + 
    geom_sf(data = current_shapefile, aes(fill=price_ratio), linewidth = NA, color = "black") + 
    scale_fill_viridis_c(option="viridis",
                         limits=quantile(shapefile_RE21 %>% filter(type==!!current_type) %>% pull(price_ratio), 
                                         probs = c(0.05, 0.95), na.rm = T),
                         oob = scales::squish, 
                         direction = -1)+
    labs(fill="Price-rent\nratio")+
    geom_sf_text(data=current_shapefile, aes(label = ABR, fontface=2), colour = "white", size=4)
  
  ggsave(plot = map_RE21_price_ratio, filename = paste0("./Analysis/RE21_",current_sector,"_price_ratio.",plot_type), 
         device = plot_type,units = "cm",width =10, height = 10, dpi = 300)
}
# 
# cor(shapefile_RE21 %>% filter(sector=="residential",type=="rent") %>% pull(price_pretreat) %>% na.omit(),
#     shapefile_RE21 %>% filter(sector=="residential",type=="rent") %>% pull(price_change) %>% na.omit())
# cor(shapefile_RE21 %>% filter(sector=="retail",type=="rent") %>% pull(price_pretreat) %>% na.omit(),
#     shapefile_RE21 %>% filter(sector=="retail",type=="rent") %>% pull(price_change) %>% na.omit())
# cor(shapefile_RE21 %>% filter(sector=="office",type=="rent") %>% pull(price_pretreat) %>% na.omit(),
#     shapefile_RE21 %>% filter(sector=="office",type=="rent") %>% pull(price_change) %>% na.omit())
# 
# cor(shapefile_RE21 %>% filter(sector=="residential",type=="sale") %>% pull(price_pretreat) %>% na.omit(),
#     shapefile_RE21 %>% filter(sector=="residential",type=="sale") %>% pull(price_change) %>% na.omit())
# cor(shapefile_RE21 %>% filter(sector=="retail",type=="sale") %>% pull(price_pretreat) %>% na.omit(),
#     shapefile_RE21 %>% filter(sector=="retail",type=="sale") %>% pull(price_change) %>% na.omit())
# cor(shapefile_RE21 %>% filter(sector=="office",type=="sale") %>% pull(price_pretreat) %>% na.omit(),
#     shapefile_RE21 %>% filter(sector=="office",type=="sale") %>% pull(price_change) %>% na.omit())

shapefile_RE21 %>%
  filter(type=="rent") %>% 
  select(-geometry) %>% 
  group_by(sector, type) %>% 
  summarise(price_ratio=sum(sale_pretreat*Population, na.rm = T)/sum(rent_pretreat*Population, na.rm = T)/12, .groups="drop")

shapefile_RE21 %>%
  filter(type=="rent") %>% 
  select(-geometry) %>% 
  group_by(sector, type) %>% 
  summarise(price_ratio_1=sum(sale_pretreat, na.rm = T)/sum(rent_pretreat, na.rm = T)/12,
            price_ratio_2=mean(price_ratio), .groups="drop")

rm(price_types, price_sectors)
rm(map_RE21_price)
rm(current_sector, current_type)

# Analysis - Covid Cases evolution ------------------------------------------

shapefile_covid=left_join(shapefile,
                          ags_book %>% select(ags_9, ags_5) %>% unique(),
                          by=c("ags_9"))


for(month in c(paste0("20_0",1:9),paste0("20_",10:12),paste0("21_0",1:8))){
  current_month_data=left_join(shapefile_covid,
                               covid_data$Cases_by_LK_Month %>% filter(Year_Month==month),
                               by="ags_5")
  cases_share_month=theme_map + 
    geom_sf(data = current_month_data, aes(fill=Cases_share), linewidth = NA, color = "black") + 
    scale_fill_viridis_c(option="viridis",
                         limits=quantile(covid_data$Cases_by_LK_Month$Cases_share, 
                                         probs = c(0.05, 0.95), na.rm = T),
                         oob = scales::squish, 
                         direction = -1)+
    labs(fill="Share")+
    geom_sf_text(data=current_month_data, aes(label = ABR, fontface=2), colour = "white", size=4)
  

  ggsave(plot = cases_share_month, 
         filename = paste0("./Analysis/Cases_share_",month,".",plot_type), 
         device = plot_type,units = "cm",width =10, height = 10, dpi = 300)
  
  cases_share_month_variable=theme_map + 
    geom_sf(data = current_month_data, aes(fill=Cases_share), linewidth = NA, color = "black") + 
    scale_fill_viridis_c(option="viridis",
                         limits=quantile(current_month_data$Cases_share, 
                                         probs = c(0.05, 0.95), na.rm = T),
                         oob = scales::squish, 
                         direction = -1)+
    labs(fill="Share")+
    geom_sf_text(data=current_month_data, aes(label = ABR, fontface=2), colour = "white", size=4)
  
  ggsave(plot = cases_share_month_variable, 
         filename = paste0("./Analysis/monthly_colorscale/Cases_share_",month,".",plot_type), 
         device = plot_type,units = "cm",width =10, height = 10, dpi = 300)
  
  
}
cases_share_month_until=function(ym_vector, last_month_string){
  current_month_data=left_join(shapefile_covid,
                               covid_data$Cases_by_LK_Month %>% filter(Year_Month %in% ym_vector) %>%
                                 group_by(ags_5) %>% 
                                 summarise(Cases_share=sum(Cases_share), .groups="drop"),
                               by="ags_5")
  cases_share_month_until=theme_map + 
    geom_sf(data = current_month_data, aes(fill=Cases_share), linewidth = NA, color = "black") + 
    scale_fill_viridis_c(option="viridis",
                         limits=quantile(current_month_data$Cases_share, 
                                         probs = c(0.05, 0.95), na.rm = T),
                         oob = scales::squish, 
                         direction = -1)+
    labs(fill="Share")+
    geom_sf_text(data=current_month_data, aes(label = ABR, fontface=2), colour = "white", size=4)
  
  ggsave(plot = cases_share_month_until, filename = paste0("./Analysis/Cases_share_until_",last_month_string,".",plot_type), 
         device = plot_type,units = "cm",width = 10, height = 10, dpi = 300)
}

cases_share_month_until(ym_vector=c(paste0("20_0",1:5)),
                        last_month_string="20_05")
cases_share_month_until(ym_vector=c(paste0("20_0",1:9),paste0("20_",10)),
                        last_month_string="20_10")
cases_share_month_until(ym_vector=c(paste0("20_0",1:9),paste0("20_",10:12)),
                        last_month_string="20_12")
cases_share_month_until(ym_vector=c(paste0("20_0",1:9),paste0("20_",10:12),paste0("21_0",1:2)),
                        last_month_string="21_02")
cases_share_month_until(ym_vector=c(paste0("20_0",1:9),paste0("20_",10:12),paste0("21_0",1:8)),
                        last_month_string="21_08")

rm(month, cases_share_month_until, cases_share_month, cases_share_month_variable)


# Analysis - Mobilfunk Daten ----------------------------------------------

shapefile_broadband=left_join(shapefile,
                              broadband_data %>% select(ags_9, priv_alle_tech_200),
                              by="ags_9") %>%
  mutate(across(starts_with("priv_alle"),~replace_na(.,0)))

map_broadband=theme_map + 
  geom_sf(data = shapefile_broadband, aes(fill=priv_alle_tech_200), linewidth = NA, color = "black") + 
  scale_fill_viridis_c(option="viridis",
                        limits=quantile(shapefile_broadband$priv_alle_tech_200, 
                                        probs = c(0.05, 1), na.rm = T),
                        oob = scales::squish, 
                        direction = -1)+
  labs(fill="Broadband\ncoverage [%]")+
  geom_sf_text(data=shapefile_broadband, aes(label = ABR, fontface=2), colour = "white", size=4)


ggsave(plot = map_broadband, filename = paste0("./Analysis/Broadband_coverage.",plot_type), 
               device = plot_type,units = "cm",width =10, height = 10, dpi = 300)
       

